#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
The :mod:`ml_research_toolkit.datasets.uci` contains classes:
- :class:`ml_research_toolkit.datasets.uci.UCI`
"""
__docformat__ = 'restructuredtext'
import requests
import zipfile
import logging
import base64
import shutil
import ssl
import os
import urllib.request
import urllib.parse

from bs4 import BeautifulSoup
import pandas as pd


def read_csv(path, featuresize):
    r"""
    Read a dataset file, trying several separators until the number of
    columns matches ``featuresize + 1`` (features plus the target column);
    return None if no separator matches.
    """
    data = None
    for sep in [',', ';', '\t']:
        data = pd.read_csv(path, sep=sep)
        if len(data.columns) == featuresize + 1:
            break
        else:
            data = None
    return data


def _filededup(_type, folder):
    r"""
    Keep only the largest file with extension ``_type`` in ``folder`` and
    remove the smaller ones.
    """
    files = os.listdir(folder)
    list_of_related_files = []
    for file in files:
        tag = file.split('.')[-1]
        if tag == _type:
            list_of_related_files.append(
                (file,
                 os.path.getsize(os.path.join(folder, file))))
    # Sort by file size ascending and delete everything but the largest file.
    list_of_related_files = sorted(
        list_of_related_files, key=lambda x: x[1])
    for i in range(len(list_of_related_files) - 1):
        os.remove(
            os.path.join(folder, list_of_related_files[i][0]))


def _fileclean(folder, _types=()):
    r"""
    Remove every file or directory in ``folder`` whose extension is not
    listed in ``_types``.
    """
    files = os.listdir(folder)
    for file in files:
        tag = file.split('.')[-1]
        if tag not in _types:
            delpath = os.path.join(folder, file)
            if os.path.isdir(delpath):
                shutil.rmtree(delpath)
            else:
                os.remove(delpath)


class UCI(object):
r"""
Class for UCI datasets manager system.
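
    A minimal usage sketch (the dataset name below is illustrative; what is
    actually available depends on the UCI portal)::

        uci = UCI(cache='.uci')
        dataset = uci.get_dataset('Iris')
        dataset['meta']   # metadata row parsed from the portal table
        dataset['data']   # pandas.DataFrame or None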
"""

    def __init__(self,
                 enforce=False,
                 cache='.uci',
                 url="https://archive.ics.uci.edu/ml/datasets.php"):
        r"""
        :param enforce: Rewrite all information in the cache.
        :type enforce: bool
        :param cache: Cache directory for saving metadata and datasets.
        :type cache: str
        :param url: URL of the UCI dataset index; it may change over time.
        :type url: str
        """
        self._cache = cache
        self._url = url
        self._enforce = enforce
        self._meta_dir = 'metas'
        self._dataset_dir = 'datasets'
        self._uci_data_folder = 'machine-learning-databases'
        if self._cache:
            # Create the cache layout if it does not exist yet.
            os.makedirs(self._cache, exist_ok=True)
            os.makedirs(os.path.join(self._cache, self._meta_dir),
                        exist_ok=True)
            os.makedirs(os.path.join(self._cache, self._dataset_dir),
                        exist_ok=True)
            self._cache = os.path.abspath(self._cache)
        self._meta = None

    def _download_meta(self):
        try:
            datasets = pd.read_html(self._url)
        except Exception as e:
            logging.warning(
                "Could not read the table from the UCI ML portal, sorry!")
            raise e
        # The dataset listing is parsed as the sixth table on the page.
        df = datasets[5]
        # Drop the first row of the parsed table before assigning column names.
        df = df.iloc[1:]
        df.columns = [
            "Name",
            "Data Types",
            "Task",
            "Feature Types",
            "Samplesize",
            "Featuresize",
            "Year",
        ]
        # Add links to the dataset pages.
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        uh = urllib.request.urlopen(self._url, context=ctx)
        html = uh.read()
        soup = BeautifulSoup(html, "html5lib")
        urls = dict()
        for tag in soup.find_all("a"):
            if tag.contents and isinstance(tag.contents[0], str):
                urls[tag.contents[0].strip()] = tag.get_attribute_list(
                    'href')[0]
        url_lst = []
        relevant_names = df['Name'].values
        for name in relevant_names:
            if name in urls:
                url = '/'.join([
                    '/'.join(self._url.split('/')[:-1]),
                    urls[name]
                ])
            else:
                url = None
            url_lst.append(url)
        df['URL'] = url_lst
        df.dropna(inplace=True)
        # Fix multi-task and other multi-valued cells by stripping spaces.
        df['Task'] = list(
            map(lambda x: x.replace(' ', ''), df['Task'].values))
        df['Data Types'] = list(
            map(lambda x: x.replace(' ', ''), df['Data Types'].values))
        df['Feature Types'] = list(
            map(lambda x: x.replace(' ', ''), df['Feature Types'].values))
        df = df.drop_duplicates()
        df.index = list(range(len(df)))
        return df

    def _get_meta(self, enforce=None):
        if enforce is None:
            enforce = self._enforce
        cache_file = os.path.join(self._cache, self._meta_dir, 'meta.csv')
        if os.path.exists(cache_file) and not enforce:
            df = pd.read_csv(cache_file, sep=';', header=0)
        else:
            df = self._download_meta()
            df.to_csv(cache_file, index=False, sep=';')
        self._meta = df
        return self._meta

    def _download_dataset(self, name):
        # The cache directory is keyed by the base64-encoded dataset name,
        # since names may contain characters unsafe for file paths.
        ID = base64.b64encode(name.encode("UTF-8")).decode("UTF-8")
        cache_dir = os.path.join(self._cache, self._dataset_dir, ID)
        url = self._meta.loc[self._meta['Name'] == name]['URL'].values[0]
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        uh = urllib.request.urlopen(url, context=ctx)
        html = uh.read()
        soup = BeautifulSoup(html, "html5lib")
        database_id = None
        for tag in soup.find_all("a"):
            sp_href = tag.get_attribute_list('href')[0].split('/')
            if self._uci_data_folder in sp_href:
                database_id = sp_href[sp_href.index(self._uci_data_folder) + 1]
        # Build the URL of the raw data directory for this dataset.
        url = '/'.join([
            '/'.join(self._url.split('/')[:-1]),
            self._uci_data_folder,
            database_id
        ])
        uh = urllib.request.urlopen(url, context=ctx)
        html = uh.read().decode()
        soup = BeautifulSoup(html, "html5lib")
        links = []
        for link in soup.find_all("a"):
            links.append(link.attrs["href"])
        links_to_download = []
        for link in links:
            links_to_download.append(urllib.parse.urljoin(url + '/', link))
        try:
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)
            for link in links_to_download:
                if not os.path.basename(link):
                    # Skip directory links such as the parent directory.
                    continue
                filename = os.path.join(cache_dir, os.path.basename(link))
                r = requests.get(link, stream=True)
                with open(filename, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                _type = os.path.basename(filename).split('.')[-1]
                if _type == 'zip':
                    with zipfile.ZipFile(filename, 'r') as zip_ref:
                        zip_ref.extractall(cache_dir)
                    os.remove(filename)
            # Keep only the largest file of each useful type, drop the rest.
            _filededup('csv', cache_dir)
            _filededup('data', cache_dir)
            _filededup('names', cache_dir)
            _filededup('train', cache_dir)
            _fileclean(cache_dir, _types=['csv', 'data', 'train', 'names'])
        except Exception as e:
            shutil.rmtree(cache_dir, ignore_errors=True)
            logging.warning(
                f'Something went wrong while downloading dataset `{name}`: '
                f'{str(e)}')

    def _get_dataset(self, name, enforce=None, download=True):
        if enforce is None:
            enforce = self._enforce
        ID = base64.b64encode(name.encode("UTF-8")).decode("UTF-8")
        dataset = dict()
        temp = self._meta.loc[self._meta['Name'] == name].to_dict().items()
        dataset['meta'] = {key: list(item.values())[0] for key, item in temp}
        dataset['data'] = None
        cache_dir = os.path.join(self._cache, self._dataset_dir, ID)
        if not (os.path.exists(cache_dir) and not enforce) and download:
            self._download_dataset(name)
        try:
            files = os.listdir(cache_dir)
            tags = list(map(lambda x: x.split('.')[-1], files))
            if 'data' in tags:
                dataset['data'] = read_csv(
                    os.path.join(cache_dir, files[tags.index('data')]),
                    dataset['meta']['Featuresize'])
                if 'names' in tags:
                    with open(
                            os.path.join(
                                cache_dir, files[tags.index('names')])) as f:
                        dataset['names'] = f.read()
            elif 'csv' in tags:
                dataset['data'] = read_csv(
                    os.path.join(cache_dir, files[tags.index('csv')]),
                    dataset['meta']['Featuresize'])
        except Exception as e:
            logging.warning(
                f'Something went wrong while reading dataset `{name}`: '
                f'{str(e)}')
        return dataset

    def get_dataset(self, name, enforce=None, download=True):
r"""
Get dataset by given name.
:param name: Name of the dataset for downloading.
:type name: str
:param enforce: Enforce rewriting all metadata in
the cache or use cached data.
Default None: use initial value.
:type enforce: bool
:param download: Download dataset or not.
If False and dataset is not presented than return None.
:type download: bool
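
        Example (the dataset name is illustrative; availability depends on
        the UCI portal)::

            uci = UCI()
            iris = uci.get_dataset('Iris')
            iris['data'].head()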
"""
        if self._meta is None:
            self._get_meta()
        return self._get_dataset(name, enforce=enforce, download=download)

    def __len__(self):
        if self._meta is None:
            self._get_meta()
        return len(self._meta)

    def __iter__(self):
        if self._meta is None:
            self._get_meta()
        def generator():
            for name in self._meta['Name'].values:
                yield self._get_dataset(name)
        return generator()
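

# A minimal, illustrative usage sketch: the dataset name 'Iris' is an
# assumption about what the UCI portal currently lists, and running this
# requires network access to archive.ics.uci.edu.
if __name__ == '__main__':
    uci = UCI(cache='.uci')
    print(f'{len(uci)} datasets listed in the UCI index')
    dataset = uci.get_dataset('Iris')
    if dataset['data'] is not None:
        print(dataset['data'].head())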