diff --git a/.gitignore b/.gitignore index 5558556e9..17b6d82f8 100644 --- a/.gitignore +++ b/.gitignore @@ -64,6 +64,7 @@ instance/ # Sphinx documentation docs/_build/ +docs/api/ docs/rdt.rst docs/rdt.*.rst docs/modules.rst @@ -109,6 +110,3 @@ ENV/ # Vim .*.swp - -# demo data -examples/data/airbnb diff --git a/.travis.yml b/.travis.yml index fe86cdc92..3ed7bca0b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,39 +1,21 @@ # Config file for automatic testing at travis-ci.org +dist: xenial language: python python: + - 3.7 - 3.6 - 3.5 # Command to install dependencies -install: pip install -U tox-travis +install: pip install -U tox-travis codecov # Command to run tests script: tox -deploy: +after_success: codecov - # Automatically deploy releases to PyPI for each tagged commit - # Assuming you have installed the travis-ci CLI tool, after you - # create the Github repo and add it to Travis, run the - # following command to finish PyPI deployment setup: - # $ travis encrypt MY_PYPI_PASSWORD - # and then copy the (really long) encrypted password as shown. 
- # - provider: pypi - # distributions: sdist bdist_wheel - # user: dai_lab_mit - # password: - # secure: "TnqNnauWBoQXaq+S0i2sBxSyKWfn9I2+b2p5le8Y19AkZ5JI+1dHBrZU+SoI7DwLXqfudgVLke2bLStVS+T1b5UOpdABaHEkVKzYwlQRD8AXh/844HCJ3SwZpqmRidcu1f7mlEtvzq9Q3x8lNuXpY1qTHaEq7BKs/ZsInLfqvSWz7RmzYrv/VW27ELwWmQq8d9U/Uw0Ww/FyjxGiep4qf0pkTuZWWn1Og6RJMZ9uB/slfrdzlLuAdTc2Evml3nrNCm3gzpSe8/9jchHbqq+3XRnOamkBF984vROasEGv2rxM5W1xII6N4jA8MNz0PQu0yAiXcl8M58d9oFXSDaX2hfCeYUQPGMCi0upxyIYsOgW9gz5H/fe45dgjsMg9FxJsUBcGNAJHkT0z2rRWKtauGQnhWZFGannc54OzbY1BfMF7Af4jFaz7i6OoNUjKgJdRnShfwCpxAb76HPd4myK/yLG9n/vMFRdF0fnrPVRRFDdsEpjoYUFSdnnTnAasWNaJ8Xt4yYk1M3LeBgo55MajDMyKP9ENHBEEZbgvpjIsHczeZLXUCDqUcED1PHmqA+z4oX08ntUuWK9FD7G3AbbvllsGqJK7IYfuX8PL77OfioDZYYwNqlGaiTdLEK1j7IkiCSkpmODVJznUAYgnJu2u2rSvQ1xfWkrzykmYQ0GYcLw=" - # on: - # tags: true - # branch: stable - # repo: HDI-Project/RDT - # python: 3.6 +deploy: - # Automatically build and deploy documentation to GitHub Pages after every - # commit - # Follow the instructions at https://docs.travis-ci.com/user/deployment/pages/ - # to setup a personal deployment token and then provide it as a secure - # environment variable at https://travis-ci.org/HDI-Project/RDT/settings - provider: pages skip-cleanup: true github-token: "$GITHUB_TOKEN" @@ -42,4 +24,4 @@ deploy: target-branch: gh-pages on: branch: stable - python: 3.6 + python: 3.7 diff --git a/AUTHORS.rst b/AUTHORS.rst index 9fc586349..898e412b3 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -2,9 +2,8 @@ Credits ======= -Contributors ------------- - -* Andrew Montanez +* Manuel Alvarez +* Carles Sala +* José David Pérez * Kalyan Veeramachaneni -* Manuel Alvarez \ No newline at end of file +* Andrew Montanez diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index c33080108..4cc4b8b2c 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -180,7 +180,7 @@ The process of releasing a new version involves several steps combining both ``g these changes are committed and available in ``master`` branch. 
Normally this is just a list of the Pull Requests that have been merged since the latest version. -Once this is done, just run the following commands:: +Once this is done, just run the following commands: 1. If you are releasing a patch version:: diff --git a/HISTORY.md b/HISTORY.md index 66a207a05..e64b1ab80 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,20 @@ # History +## 0.1.3 - 2019-09-24 + +### New Features + +* Add attributes NullTransformer and col_meta. - Issue [#30](https://github.com/HDI-Project/RDT/issues/30) by @ManuelAlvarezC + +### General Improvements + +* Integrate with CodeCov - Issue [#89](https://github.com/HDI-Project/RDT/issues/89) by @csala +* Remake Sphinx Documentation - Issue [#96](https://github.com/HDI-Project/RDT/issues/96) by @JDTheRipperPC +* Improve README - Issue [#92](https://github.com/HDI-Project/RDT/issues/92) by @JDTheRipperPC +* Document RELEASE workflow - Issue [#93](https://github.com/HDI-Project/RDT/issues/93) by @JDTheRipperPC +* Add support to Python 3.7 - Issue [#38](https://github.com/HDI-Project/RDT/issues/38) by @ManuelAlvarezC +* Create way to pass HyperTransformer table dict - Issue [#45](https://github.com/HDI-Project/RDT/issues/45) by @ManuelAlvarezC + ## 0.1.2 * Add a numerical transformer for positive numbers. 
diff --git a/MANIFEST.in b/MANIFEST.in index 469520f58..718245942 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,6 +3,7 @@ include CONTRIBUTING.rst include HISTORY.md include LICENSE include README.md +include RELEASE.md recursive-include tests * recursive-exclude * __pycache__ diff --git a/Makefile b/Makefile index 1c2c11fa8..f0f99eed6 100644 --- a/Makefile +++ b/Makefile @@ -49,6 +49,7 @@ clean-pyc: ## remove Python file artifacts .PHONY: clean-docs clean-docs: ## remove previously built docs + rm -f docs/api/*.rst -$(MAKE) -C docs clean 2>/dev/null # this fails if sphinx is not yet installed .PHONY: clean-coverage @@ -107,7 +108,7 @@ test: ## run tests quickly with the default Python .PHONY: test-all test-all: ## run tests on every Python version with tox - tox + tox -r .PHONY: coverage coverage: ## check code coverage quickly with the default Python @@ -121,8 +122,8 @@ coverage: ## check code coverage quickly with the default Python .PHONY: docs docs: clean-docs ## generate Sphinx HTML documentation, including API docs + sphinx-apidoc --separate -T -o docs/api/ rdt $(MAKE) -C docs html - touch docs/_build/html/.nojekyll .PHONY: view-docs view-docs: docs ## view docs in browser @@ -171,8 +172,8 @@ bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion-major: ## Bump the version the next major skipping the release bumpversion --no-tag major -CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD) -CHANGELOG_LINES := $(shell git diff HEAD..stable HISTORY.md | wc -l) +CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) +CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) .PHONY: check-release check-release: ## Check if the release can be made diff --git a/README.md b/README.md index 994ca6fe6..8a03f65b7 100644 --- a/README.md +++ b/README.md @@ -1,114 +1,173 @@

-“Copulas” - An open source project from Data to AI Lab at MIT. +An open source project from Data to AI Lab at MIT.

-[![][pypi-img]][pypi-url] [![][travis-img]][travis-url] +[![PyPi Shield](https://img.shields.io/pypi/v/RDT.svg)](https://pypi.python.org/pypi/RDT) +[![Travis CI Shield](https://travis-ci.org/HDI-Project/RDT.svg?branch=master)](https://travis-ci.org/HDI-Project/RDT) +[![Coverage Status](https://codecov.io/gh/HDI-Project/RDT/branch/master/graph/badge.svg)](https://codecov.io/gh/HDI-Project/RDT) +[![Downloads](https://pepy.tech/badge/rdt)](https://pepy.tech/project/rdt) -# Reversible Data Transforms +# RDT: Reversible Data Transforms -This a python library used to transform data for data science libraries and preserve the transformations in order to reverse them as needed. - -- Free software: MIT license +- License: MIT - Documentation: https://HDI-Project.github.io/RDT +- Homepage: https://github.com/HDI-Project/RDT + +## Overview -[travis-img]: https://travis-ci.org/HDI-Project/RDT.svg?branch=master -[travis-url]: https://travis-ci.org/HDI-Project/RDT -[pypi-img]: https://img.shields.io/pypi/v/RDT.svg -[pypi-url]: https://pypi.python.org/pypi/RDT +**RDT** is a Python library used to transform data for data science libraries and preserve +the transformations in order to revert them as needed. +# Install +## Requirements -## Installation +**RDT** has been developed and tested on [Python 3.5, 3.6 and 3.7](https://www.python.org/downloads) -### Install with pip +Also, although it is not strictly required, the usage of a +[virtualenv](https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid +interfering with other software installed in the system where **RDT** is run. 
-The simplest and recommended way to install RDT is using `pip`: +These are the minimum commands needed to create a virtualenv using python3.6 for **RDT**: +```bash +pip install virtualenv +virtualenv -p $(which python3.6) rdt-venv ``` -pip install rdt + +Afterwards, you have to execute this command to have the virtualenv activated: + +```bash +source rdt-venv/bin/activate ``` -### Install from sources +Remember about executing it every time you start a new console to work on **RDT**! + +## Install with pip -You can also clone the repository and install it from sources +After creating the virtualenv and activating it, we recommend using +[pip](https://pip.pypa.io/en/stable/) in order to install **RDT**: +```bash +pip install rdt ``` -git clone git@github.com:HDI-Project/RDT.git + +This will pull and install the latest stable release from [PyPi](https://pypi.org/). + +## Install from sources + +Alternatively, with your virtualenv activated, you can clone the repository and install +it from source by running `make install` on the `stable` branch: + +```bash +git clone https://github.com/HDI-Project/RDT cd RDT -pip install -e . +git checkout stable +make install ``` -## Usage +For development, you can use `make install-develop` instead in order to install all +the required dependencies for testing and code linting. + +# Quickstart -This library is used to apply desired transformations to individual tables or entire datasets -all at once, with the goal of getting completely numeric tables as the output. The desired -transformations can be specified at the column level, or dataset level. For example, you can -apply a datetime transformation to only select columns, or you can specify that you want every -datetime column in the dataset to go through that transformation. +In this short series of tutorials we will guide you through a series of steps that will +help you getting started using **RDT** to transform columns, tables and datasets. 
-### Transforming a column +## Transforming a column -The base class of this library is the BaseTransformer class. This class provides method to fit -a transformer to your data and transform it, a method to transform new data with an already -fitted transformer and a method to reverse a transform and get data that looks like the original -input. Each transformer class inherits from the BaseTransformer class, and thus has all -these methods. +In this first guide, you will learn how to use **RDT** in its simplest form, transforming +a single column loaded as a `pandas.DataFrame` object. -Transformers take in a column and the meta data for that column as an input. Below we will -demonstrate how to use a datetime transformer to transform and reverse transform a column. +### 1. Load the column and its metadata -First we need to decompress the demo data included in the repository by running this -command on a shell: +In order to load a column and its metadata, you must call the `rdt.load_data` function passing +it the path to the metadata json file, the name of the table from which to load the column, +and the name of the column to load. + +You can find documentation about the metadata format in [MetaData.json]( +https://github.com/HDI-Project/MetaData.json). 
+ +```python +from rdt import load_data + +metadata_path = 'tests/data/airbnb/airbnb_meta.json' + +column_data, column_metadata = load_data( + metadata_path=metadata_path, + table_name='users', + column_name='date_account_created', +) +``` + +The output will be the variable `column_data`, which is a `pandas.DataFrame` with the column data: ``` -tar -xvzf examples/data/airbnb.tar.gz -C examples/data/ + date_account_created +0 2014-01-01 +1 2014-01-01 +2 2014-01-01 +3 2014-01-01 +4 2014-01-01 ``` -Afterwards, we can proceed to open a python interpreter and load the data +And the `column_metadata`, which is a `dict` containing the information from the metadata json +that corresponds to this column: ```python ->>> from rdt.transfomers import get_col_info ->>> demo_data = 'examples/data/airbnb/Airbnb_demo_meta.json' ->>> column, column_metadata = get_col_info('users', 'date_account_created', demo_data) ->>> column.head(5) -0 2014-01-01 -1 2014-01-01 -2 2014-01-01 -3 2014-01-01 -4 2014-01-01 -Name: date_account_created, dtype: object - ->>> column_metadata -{'name': 'date_account_created', - 'type': 'datetime', - 'format': '%Y-%m-%d', - 'uniques': 1634} +{ + 'name': 'date_account_created', + 'type': 'datetime', + 'format': '%Y-%m-%d', + 'uniques': 1634 +} +``` +### 2. Load the transformer + +In this case the column is a datetime, so we will use the `DTTransformer`. + +```python +from rdt.transformers import DTTransformer +transformer = DTTransformer(column_metadata) ``` -Now we can transform the column. +### 3. 
Transform the column data + +In order to transform the data, we will call its `fit_transform` method passing the +`column` data: ```python ->>> from rdt.transformers.DTTransformer import DTTransformer ->>> transformer = DTTransformer() ->>> transformed_data = transformer.fit_transform(column, column_metadata) ->>> transformed_data.head(5) -0 1 1.388531e+18 -1 1 1.388531e+18 -2 1 1.388531e+18 -3 1 1.388531e+18 -4 1 1.388531e+18 +transformed_data = transformer.fit_transform(column_data) +``` +The output will be another `pandas.DataFrame` with the transformed data: + +``` + date_account_created +0 1.388534e+18 +1 1.388534e+18 +2 1.388534e+18 +3 1.388534e+18 +4 1.388534e+18 ``` -If you want to reverse the transformation and get the original data back, you can run the -following command. +### 4. Revert the column transformation + +In order to revert the previous transformation, the transformed data can be passed to +the `reverse_transform` method of the transformer: ```python ->>> reverse_transformed = transformer.reverse_transform(transformed_data, column_metadata) ->>> reverse_transformed.head(5) - date_account_created +reversed_data = transformer.reverse_transform(transformed_data) +``` + +The output will be a `pandas.DataFrame` containing the data from which the transformed data +was generated with. + +In this case, of course, the obtained data should be identical to the original one: + +``` date_account_created 0 2014-01-01 1 2014-01-01 @@ -117,162 +176,119 @@ following command. 4 2014-01-01 ``` -### Transforming a table +## Transforming a table -You can also transform an entire table using the HyperTransformer class. Again, we can start by -loading the data. +Once we know how to transform a single column, we can try to go the next level and transform +a table with multiple columns. + +### 1. Load the table data and its metadata + +In order to load a complete table, we will use the same `rdt.load_data` function as before, +but omit the `column_name` from the call. 
```python ->>> from rdt.utils import get_table_dict ->>> meta_file = 'examples/data/airbnb/Airbnb_demo_meta.json' ->>> table_dict = get_table_dict(meta_file) ->>> table, table_meta = table_dict['users'] +table_data, table_metadata = load_data( + metadata_path=metadata_path, + table_name='users', +) +``` + +The output, like before, will be composed of the `table_data`, which in this case will contain +all the columns from the table: + +``` + id date_account_created timestamp_first_active ... signup_app first_device_type first_browser +0 d1mm9tcy42 2014-01-01 20140101000936 ... Web Windows Desktop Chrome +1 yo8nz8bqcq 2014-01-01 20140101001558 ... Web Mac Desktop Firefox +2 4grx6yxeby 2014-01-01 20140101001639 ... Web Windows Desktop Firefox +3 ncf87guaf0 2014-01-01 20140101002146 ... Web Windows Desktop Chrome +4 4rvqpxoh3h 2014-01-01 20140101002619 ... iOS iPhone -unknown- ``` -Now you can pass a list of the desired transformers into the `fit_transform_table` function to -transform the whole table.
+And the `table_metadata`, which will also contain all the information available about the table: ```python ->>> from rdt.hyper_transformer import HyperTransformer ->>> ht = HyperTransformer(meta_file) ->>> tl = ['DTTransformer', 'NumberTransformer', 'CatTransformer'] ->>> transformed = ht.fit_transform_table(table, table_meta, transformer_list=tl) ->>> transformed.head(3).T - 0 1 2 -?date_account_created 1.000000e+00 1.000000e+00 1.000000e+00 -date_account_created 1.388531e+18 1.388531e+18 1.388531e+18 -?timestamp_first_active 1.000000e+00 1.000000e+00 1.000000e+00 -timestamp_first_active 1.654000e+13 1.654000e+13 1.654000e+13 -?date_first_booking 1.000000e+00 0.000000e+00 0.000000e+00 -date_first_booking 1.388790e+18 0.000000e+00 0.000000e+00 -?gender 1.000000e+00 1.000000e+00 1.000000e+00 -gender 8.522112e-01 3.412078e-01 1.408864e-01 -?age 1.000000e+00 0.000000e+00 0.000000e+00 -age 6.200000e+01 3.700000e+01 3.700000e+01 -?signup_method 1.000000e+00 1.000000e+00 1.000000e+00 -signup_method 3.282037e-01 3.500181e-01 4.183867e-01 -?signup_flow 1.000000e+00 1.000000e+00 1.000000e+00 -signup_flow 4.453093e-01 3.716032e-01 3.906801e-01 -?language 1.000000e+00 1.000000e+00 1.000000e+00 -language 2.927157e-01 5.682538e-01 6.622744e-01 -?affiliate_channel 1.000000e+00 1.000000e+00 1.000000e+00 -affiliate_channel 9.266169e-01 5.640470e-01 8.044208e-01 -?affiliate_provider 1.000000e+00 1.000000e+00 1.000000e+00 -affiliate_provider 7.717574e-01 2.539509e-01 7.288847e-01 -?first_affiliate_tracked 1.000000e+00 1.000000e+00 1.000000e+00 -first_affiliate_tracked 3.861429e-01 8.600605e-01 4.029200e-01 -?signup_app 1.000000e+00 1.000000e+00 1.000000e+00 -signup_app 6.915504e-01 6.373492e-01 5.798949e-01 -?first_device_type 1.000000e+00 1.000000e+00 1.000000e+00 -first_device_type 6.271052e-01 2.611754e-01 6.828802e-01 -?first_browser 1.000000e+00 1.000000e+00 1.000000e+00 -first_browser 2.481743e-01 5.087636e-01 5.023412e-01 +{ + 'path': 'users_demo.csv', + 'name': 'users', + 
'use': True, + 'headers': True, + 'fields': [ + { + 'name': 'id', + 'type': 'id', + 'regex': '^.{10}$', + 'uniques': 213451 + }, + ... + { + 'name': 'first_browser', + 'type': 'categorical', + 'subtype': 'categorical', + 'uniques': 52 + } + ], + 'primary_key': 'id', + 'number_of_rows': 213451 +} +``` +### 2. Load the HyperTransformer + +In order to manipulate a complete table we will need to import the `rdt.HyperTransformer` class +and create an instance of it passing it the path to our metadata file. + +```python +from rdt import HyperTransformer +ht = HyperTransformer(metadata=metadata_path) ``` -You can then reverse transform the output to get a table in the original format, but it will -only contain the columns corresponding to those that were transformed (ie. numeric columns). +### 3. Transform the table data + +In order to transform the data, we will call the `fit_transform_table` method from our +`HyperTransformer` instance passing it the table data, the table metadata and the names of the +transformers that we want to apply.
```python ->>> reverse_transformed = ht.reverse_transform_table(transformed, table_meta) ->>> reverse_transformed.head(3).T - 0 1 2 -date_account_created 2014-01-01 2014-01-01 2014-01-01 -timestamp_first_active 19700101053540 19700101053540 19700101053540 -date_first_booking 2014-01-04 NaN NaN -gender MALE -unknown- -unknown- -age 62 NaN NaN -signup_method basic basic basic -signup_flow 0 0 0 -language en en en -affiliate_channel sem-non-brand direct sem-brand -affiliate_provider google direct google -first_affiliate_tracked omg untracked omg -signup_app Web Web Web -first_device_type Windows Desktop Mac Desktop Windows Desktop -first_browser Chrome Firefox Firefox +transformed = ht.fit_transform_table( + table=table_data, + table_meta=table_metadata, + transformer_list=['DTTransformer', 'NumberTransformer', 'CatTransformer'] +) +``` +The output, again, will be the transformed data: + +``` + id date_account_created timestamp_first_active ... signup_app first_device_type first_browser +0 0.512195 1.388534e+18 1.388535e+18 ... 0.204759 0.417261 0.423842 +1 0.958701 1.388534e+18 1.388535e+18 ... 0.569893 0.115335 0.756304 +2 0.106468 1.388534e+18 1.388535e+18 ... 0.381164 0.571280 0.869942 +3 0.724346 1.388534e+18 1.388536e+18 ... 0.485542 0.668070 0.364122 +4 0.345691 1.388534e+18 1.388536e+18 ... 0.944064 0.847751 0.108216 ``` -### Transforming a dataset +### 4. Revert the table transformation -The hyper transformer is also capable of transforming all of the tables specified in your -meta.json at once. +In order to revert the transformation and recover the original data from the transformed one, +we need to call `reverse_transform_table` of the `HyperTransformer` instance passing it the +transformed data and the table metadata. 
```python ->>> from rdt.hyper_transformer import HyperTransformer ->>> meta_file = 'examples/data/airbnb/Airbnb_demo_meta.json' ->>> ht = HyperTransformer(meta_file) ->>> tl = ['DTTransformer', 'NumberTransformer', 'CatTransformer'] ->>> transformed = ht.fit_transform(transformer_list=tl) ->>> transformed['users'].head(3).T - 0 1 2 -?date_account_created 1.000000e+00 1.000000e+00 1.000000e+00 -date_account_created 1.388531e+18 1.388531e+18 1.388531e+18 -?timestamp_first_active 1.000000e+00 1.000000e+00 1.000000e+00 -timestamp_first_active 1.654000e+13 1.654000e+13 1.654000e+13 -?date_first_booking 1.000000e+00 0.000000e+00 0.000000e+00 -date_first_booking 1.388790e+18 0.000000e+00 0.000000e+00 -?gender 1.000000e+00 1.000000e+00 1.000000e+00 -gender 9.061832e-01 1.729590e-01 4.287514e-02 -?age 1.000000e+00 0.000000e+00 0.000000e+00 -age 6.200000e+01 3.700000e+01 3.700000e+01 -?signup_method 1.000000e+00 1.000000e+00 1.000000e+00 -signup_method 5.306912e-01 4.082081e-01 3.028973e-01 -?signup_flow 1.000000e+00 1.000000e+00 1.000000e+00 -signup_flow 4.597129e-01 4.751324e-01 5.495054e-01 -?language 1.000000e+00 1.000000e+00 1.000000e+00 -language 2.947847e-01 4.170684e-01 5.057820e-01 -?affiliate_channel 1.000000e+00 1.000000e+00 1.000000e+00 -affiliate_channel 9.213130e-01 4.712533e-01 8.231925e-01 -?affiliate_provider 1.000000e+00 1.000000e+00 1.000000e+00 -affiliate_provider 7.649791e-01 2.028804e-01 7.174262e-01 -?first_affiliate_tracked 1.000000e+00 1.000000e+00 1.000000e+00 -first_affiliate_tracked 3.716114e-01 6.723371e-01 3.710109e-01 -?signup_app 1.000000e+00 1.000000e+00 1.000000e+00 -signup_app 3.583918e-01 2.627690e-01 4.544640e-01 -?first_device_type 1.000000e+00 1.000000e+00 1.000000e+00 -first_device_type 6.621950e-01 3.078130e-01 7.152115e-01 -?first_browser 1.000000e+00 1.000000e+00 1.000000e+00 -first_browser 2.410379e-01 4.766930e-01 4.865389e-01 - ->>> transformed['sessions'].head(3).T - 0 1 2 -?action 1.000000 1.000000 1.000000 -action 0.361382 
0.597891 0.353806 -?action_type 1.000000 1.000000 1.000000 -action_type 0.089913 0.560351 0.046400 -?action_detail 1.000000 1.000000 1.000000 -action_detail 0.070212 0.852246 0.107477 -?device_type 1.000000 1.000000 1.000000 -device_type 0.726447 0.711231 0.710298 -?secs_elapsed 1.000000 1.000000 1.000000 -secs_elapsed 319.000000 67753.000000 301.000000 - ->>> reverse_transformed = ht.reverse_transform(tables=transformed) ->>> reverse_transformed['users'].head(3).T - 0 1 2 -date_account_created 2014-01-01 2014-01-01 2014-01-01 -timestamp_first_active 19700101053540 19700101053540 19700101053540 -date_first_booking 2014-01-04 NaN NaN -gender MALE -unknown- -unknown- -age 62 NaN NaN -signup_method basic basic basic -signup_flow 0 0 0 -language en en en -affiliate_channel sem-non-brand direct sem-brand -affiliate_provider google direct google -first_affiliate_tracked omg untracked omg -signup_app Web Web Web -first_device_type Windows Desktop Mac Desktop Windows Desktop -first_browser Chrome Firefox Firefox - ->>> reverse_transformed['sessions'].head(3).T - 0 1 2 -action lookup search_results lookup -action_type None click None -action_detail None view_search_results None -device_type Windows Desktop Windows Desktop Windows Desktop -secs_elapsed 319 67753 301 +reversed_data = ht.reverse_transform_table( + table=transformed, + table_meta=table_metadata +) +``` + +The output will be the reversed data. Just like before, this should look exactly like the +original data: +``` + id date_account_created timestamp_first_active ... signup_app first_device_type first_browser +0 d1mm9tcy42 2014-01-01 20140101010936 ... Web Windows Desktop Chrome +1 yo8nz8bqcq 2014-01-01 20140101011558 ... Web Mac Desktop Firefox +2 4grx6yxeby 2014-01-01 20140101011639 ... Web Windows Desktop Firefox +3 ncf87guaf0 2014-01-01 20140101012146 ... Web Windows Desktop Chrome +4 4rvqpxoh3h 2014-01-01 20140101012619 ... 
iOS iPhone -unknown- ``` diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 000000000..a91e5ff6e --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,232 @@ +# Release workflow + +The process of releasing a new version involves several steps: + +1. [Install RDT from source](#install-rdt-from-source) + +2. [Linting and tests](#linting-and-tests) + +3. [Documentation](#documentation) + +4. [Milestone](#milestone) + +5. [HISTORY.md](#history.md) + +6. [Distribution](#distribution) + +7. [Integration with SDV](#integration-with-sdv) + +7.1. [Install SDV from source](#install-sdv-from-source) + +7.2. [Install from distribution](#install-from-distribution) + +7.3. [Run SDV tests and README.md examples](#run-sdv-tests-and-readme.md-examples) + +8. [Making the release](#making-the-release) + +8.1. [Tag and release to PyPi](#tag-and-release-to-pypi) + +8.2. [Update the release on GitHub](#update-the-release-on-github) + +## Install RDT from source + +Clone the project and install the development requirements before start the release process. Alternatively, with your virtualenv activated. + +```bash +git clone https://github.com/HDI-Project/RDT +cd RDT +git checkout master +make install-develop +``` + +## Linting and tests + +Execute ALL the tests and linting, tests must end with no errors: + +```bash +make test-all +``` + +This command will use tox to execute the unittests with different environments, see tox.ini configuration. + +To be able to run this you will need the different python versions used in the tox.ini file. 
+ +At the end, you will see an output like this: + +``` +_____________________________________________ summary ______________________________________________ + py35: commands succeeded + py36: commands succeeded + lint: commands succeeded + docs: commands succeeded +``` + +To run the tests over your python version: + +```bash +make test && make lint +``` + +And you will see something like this: + +``` +================================== 41 passed, 5 skipped in 1.29s =================================== +flake8 rdt tests examples +isort -c --recursive rdt tests examples +``` + +The execution has finished with no errors and 5 tests skipped. + +## Documentation + +The documentation must be up to date and generated with: + +```bash +make view-docs +``` + +Read the documentation to ensure all the changes are reflected in the documentation. + +Alternatively, you can simply generate the documentation using the command: + +```bash +make docs +``` + +## Milestone + +It's important to check that the GitHub milestones and issues are up to date with the release. + +You need to check that: + +* The milestone for the current release exists. + +* All the issues closed since the latest release are associated to the milestone. If they are not, associate them. + +* All the issues associated to the milestone are closed. If there are open issues but the milestone needs to be released anyway, move them to the next milestone. + +* All the issues in the milestone are assigned to at least one person. + +* All the pull requests closed since the latest release are associated to an issue. If necessary, create issues and assign them to the milestone. Also assign the person who opened the issue to them.
+ +## HISTORY.md + +Make sure HISTORY.md is updated with the issues of the milestone: + +``` +# History + +## X.Y.Z (YYYY-MM-DD) + +### New Features + +* - [Issue #](https://github.com/HDI-Project/RDT/issues/) by @resolver + +### General Improvements + +* - [Issue #](https://github.com/HDI-Project/RDT/issues/) by @resolver + +### Bugs Fixed + +* - [Issue #](https://github.com/HDI-Project/RDT/issues/) by @resolver +``` + +The issue list per milestone can be found [here][milestones]. + +[milestones]: https://github.com/HDI-Project/RDT/milestones + +## Distribution + +Generate the distribution executing: + +```bash +make dist +``` + +This will create the `dist` and `build` directories. The `dist` directory contains the library installer. + +``` +dist/ +├── rdt--py2.py3-none-any.whl +└── rdt-.tar.gz +``` + +Now, create a new virtualenv with the distributed file generated and run the README.md examples: + +1. Create the rdt-test directory (out of the RDT directory): + +```bash +mkdir rdt-test +cd rdt-test +``` + +2. Create a new virtualenv and activate it: + +```bash +virtualenv -p $(which python3.6) .venv +source .venv/bin/activate +``` + +3. Install the wheel distribution: + +```bash +pip install /path/to/rdt/dist/.whl +``` + +4. Now you are ready to execute the README.md examples. + +## Integration with SDV + +### Install SDV from source + +Clone the project and install the development requirements. Alternatively, with your virtualenv activated. + +```bash +git clone https://github.com/HDI-Project/SDV +cd SDV +git checkout master +make install-develop +``` + +### Install from distribution + +Install the RDT version from the generated distribution file. + +```bash +pip install /path/to/rdt/dist/.whl +``` + +### Run SDV tests and README.md examples + +Execute the SDV tests to ensure that the new distribution version works. + +```bash +make test +``` + +Also, execute the SDV README.md examples. + +## Making the release + +At the end, we need to make the release.
First, check if the release can be made: + +```bash +make check-release +``` + + +### Tag and release to PyPi + +Once we are sure that the release can be made we can use different commands depending on the type of release that we want to make: + +* `make release`: This will release a patch, which is the most common type of release. Use this when the changes are bugfixes or enhancements that do not modify the existing user API. Changes that modify the user API to add new features but that do not modify the usage of the previous features can also be released as a patch. + +* `make release-minor`: This will release the next minor version. Use this if the changes modify the existing user API in any way, even if it is backwards compatible. Minor backwards incompatible changes can also be released as minor versions while the library is still in beta state. After the major version 1 has been released, minor version can only be used to add backwards compatible API changes. + +* `make release-major`: This will release the next major version. Use this if the changes modify the user API in a backwards incompatible way after the major version 1 has been released. + +### Update the release on GitHub + +Once the tag and the release to PyPi has been made, go to GitHub and edit the freshly created "tag" to add the title and release notes, which should be exactly the same that we added to the HISTORY.md file. + +Finally, close the milestone and, if it does not exist, create the next one. diff --git a/docs/conf.py b/docs/conf.py old mode 100755 new mode 100644 index 72d3e3cfc..77e5230a2 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# rdt documentation build configuration file, created by +# RDT documentation build configuration file, created by # sphinx-quickstart on Fri Jun 9 13:47:02 2017.
# # This file is execfile()d with the current directory set to its @@ -17,15 +17,8 @@ # directory, add these directories to sys.path here. If the directory is # relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. -# -import os -import sys import sphinx_rtd_theme # For read the docs theme -from recommonmark.parser import CommonMarkParser -from recommonmark.transform import AutoStructify - -sys.path.insert(0, os.path.abspath('..')) import rdt @@ -38,11 +31,18 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ + 'm2r', 'sphinx.ext.autodoc', + 'sphinx.ext.githubpages', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', + 'autodocsumm', ] +autodoc_default_options = { + 'autosummary': True, +} + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -50,17 +50,17 @@ # You can specify multiple suffix as a list of string: source_suffix = ['.rst', '.md'] -source_parsers = { - '.md': CommonMarkParser, -} - # The master toctree document. master_doc = 'index' # General information about the project. -project = u'RDT' -copyright = u"2018, MIT Data To AI Lab" -author = u"MIT Data To AI Lab" +project = 'RDT' +slug = 'rdt' +title = project + ' Documentation' +copyright = '2019, MIT Data To AI Lab' +author = 'MIT Data To AI Lab' +description = 'Reversible data transforms.' +user = 'HDI-Project' # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout @@ -81,7 +81,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ['.py', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' @@ -89,7 +89,6 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False - # -- Options for HTML output ------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for @@ -101,8 +100,8 @@ # Readthedocs additions html_context = { 'display_github': True, - 'github_user': 'HDI-Project', - 'github_repo': 'rdt', + 'github_user': user, + 'github_repo': project, 'github_version': 'master', 'conf_py_path': '/docs/', } @@ -118,13 +117,22 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +html_favicon = 'images/dai-logo-white.ico' + +# If given, this must be the name of an image file (path relative to the +# configuration directory) that is the logo of the docs. It is placed at +# the top of the sidebar; its width should therefore not exceed 200 pixels. +html_logo = 'images/dai-logo-white-200.png' # -- Options for HTMLHelp output --------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'rdtdoc' +htmlhelp_basename = slug + 'doc' # -- Options for LaTeX output ------------------------------------------ @@ -150,22 +158,26 @@ # Grouping the document tree into LaTeX files. 
List of tuples # (source start file, target name, title, author, documentclass # [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'rdt.tex', - u'RDT Documentation', - u'MIT Data To AI Lab', 'manual'), -] +latex_documents = [( + master_doc, + slug + '.tex', + title, + author, + 'manual' +)] # -- Options for manual page output ------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'rdt', - u'RDT Documentation', - [author], 1) -] +man_pages = [( + master_doc, + slug, + title, + [author], + 1 +)] # -- Options for Texinfo output ---------------------------------------- @@ -173,14 +185,12 @@ # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'rdt', - u'RDT Documentation', - author, - 'rdt', - 'One line description of project.', - 'Miscellaneous'), -] - - - +texinfo_documents = [( + master_doc, + slug, + title, + author, + slug, + description, + 'Miscellaneous' +)] diff --git a/docs/history.rst b/docs/history.rst index 250649964..d26e5be83 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -1 +1 @@ -.. include:: ../HISTORY.rst +.. mdinclude:: ../HISTORY.md diff --git a/docs/images/dai-logo-white-200.png b/docs/images/dai-logo-white-200.png new file mode 100644 index 000000000..58bb9fdec Binary files /dev/null and b/docs/images/dai-logo-white-200.png differ diff --git a/docs/images/dai-logo-white.ico b/docs/images/dai-logo-white.ico new file mode 100644 index 000000000..aab7ae8cf Binary files /dev/null and b/docs/images/dai-logo-white.ico differ diff --git a/docs/index.rst b/docs/index.rst index d6709728f..6c9ff5e6c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,14 +1,16 @@ -Welcome to RDT's documentation! -====================================== +.. include:: readme.rst .. 
toctree:: + :hidden: :maxdepth: 2 - :caption: Contents: - readme - installation - usage - modules + Overview + +.. toctree:: + :caption: Resources + :hidden: + + API Reference contributing authors history diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index 533e6cf47..000000000 --- a/docs/installation.rst +++ /dev/null @@ -1,51 +0,0 @@ -.. highlight:: shell - -============ -Installation -============ - - -Stable release --------------- - -To install RDT, run this command in your terminal: - -.. code-block:: console - - $ pip install rdt - -This is the preferred method to install RDT, as it will always install the most recent stable release. - -If you don't have `pip`_ installed, this `Python installation guide`_ can guide -you through the process. - -.. _pip: https://pip.pypa.io -.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ - - -From sources ------------- - -The sources for RDT can be downloaded from the `Github repo`_. - -You can either clone the public repository: - -.. code-block:: console - - $ git clone git://github.com/HDI-Project/RDT - -Or download the `tarball`_: - -.. code-block:: console - - $ curl -OL https://github.com/HDI-Project/RDT/tarball/master - -Once you have a copy of the source, you can install it with: - -.. code-block:: console - - $ python setup.py install - - -.. _Github repo: https://github.com/HDI-Project/RDT -.. _tarball: https://github.com/HDI-Project/RDT/tarball/master diff --git a/docs/readme.rst b/docs/readme.rst index 72a335581..97d495851 100644 --- a/docs/readme.rst +++ b/docs/readme.rst @@ -1 +1 @@ -.. include:: ../README.rst +.. 
mdinclude:: ../README.md diff --git a/docs/usage.rst b/docs/usage.rst deleted file mode 100644 index 30478e507..000000000 --- a/docs/usage.rst +++ /dev/null @@ -1,7 +0,0 @@ -===== -Usage -===== - -To use RDT in a project:: - - import rdt diff --git a/examples/airbnb.py b/examples/airbnb.py deleted file mode 100644 index 85e1bb1ba..000000000 --- a/examples/airbnb.py +++ /dev/null @@ -1,57 +0,0 @@ -import argparse -import os -import tarfile - -from rdt.hyper_transformer import HyperTransformer - - -def download_airbnb_dataset(bucket, dirname): - - if not os.path.exists(dirname): - os.makedirs(dirname) - - sessions_KEY = 'sdv-demo/sessions_demo.csv' - users_KEY = 'sdv-demo/users_demo.csv' - meta_KEY = 'sdv-demo/Airbnb_demo_meta.json' - - # bucket.download_file(sessions_KEY, os.path.join(dirname, 'sessions_demo.csv')) - # bucket.download_file(users_KEY, os.path.join(dirname, 'users_demo.csv')) - bucket.download_file(meta_KEY, os.path.join(dirname, 'Airbnb_demo_meta.json')) - - - -def get_bucket(bucket_name): - s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED)) - return s3.Bucket(bucket_name) - - -def run_airbnb_demo(data_dir): - """HyperTransfomer will transform back and forth data airbnb data.""" - - # Setup - meta_file = os.path.join(data_dir, 'Airbnb_demo_meta.json') - transformer_list = ['NumberTransformer', 'DTTransformer', 'CatTransformer'] - ht = HyperTransformer(meta_file) - - # Run - transformed = ht.fit_transform(transformer_list=transformer_list) - result = ht.reverse_transform(tables=transformed) - - # Check - assert result.keys() == ht.table_dict.keys() - - for name, table in result.items(): - assert not result[name].isnull().all().all() - - -if __name__ == '__main__': - - data_dir = os.path.join(os.path.dirname(__file__), 'data') - airbnb_dir = os.path.join(data_dir, 'airbnb') - - if not os.path.exists(airbnb_dir): - tar_name = airbnb_dir + '.tar.gz' - with tarfile.open(tar_name, mode='r:gz') as tf: - tf.extractall(data_dir) - - 
run_airbnb_demo(airbnb_dir) diff --git a/examples/data/airbnb.tar.gz b/examples/data/airbnb.tar.gz deleted file mode 100644 index e6294bb6e..000000000 Binary files a/examples/data/airbnb.tar.gz and /dev/null differ diff --git a/rdt/__init__.py b/rdt/__init__.py index 6a6f73f27..b716b66d7 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -2,6 +2,69 @@ """Top-level package for RDT.""" + __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.1.2' +__version__ = '0.1.3-dev' + +import json +import os + +import pandas as pd + +from rdt.hyper_transformer import HyperTransformer + + +def _lookup(elements, field, value): + for element in elements: + if element[field] == value: + return element + + raise ValueError('Invalid {}: {}'.format(field, value)) + + +def load_data(metadata_path, table_name, column_name=None): + """Load the metadata and data from the indicated table. + + If a column name is also given, restrict the data and metadata results + to the indicated column. + + Args: + metadata_path (str): + Path to the metadata file. + table_name (str): + Name of the table. + column_name (str): + Name of the column. Optional. + + Returns: + pandas.DataFrame, dict + * Table or column loaded as a ``pandas.DataFrame``. + * The table or column metadata. 
+ """ + with open(metadata_path, 'r') as metadata_file: + metadata = json.load(metadata_file) + + table_metadata = _lookup(metadata['tables'], 'name', table_name) + + data_path = os.path.join( + os.path.dirname(metadata_path), + metadata['path'], + table_metadata['path'] + ) + table_data = pd.read_csv(data_path) + + if column_name is None: + return table_data, table_metadata + + else: + column_metadata = _lookup(table_metadata['fields'], 'name', column_name) + column_data = table_data[column_name].to_frame() + + return column_data, column_metadata + + +__all__ = [ + 'HyperTransformer', + 'load_data' +] diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py index 9a6da6880..634e5b42e 100644 --- a/rdt/hyper_transformer.py +++ b/rdt/hyper_transformer.py @@ -22,13 +22,22 @@ class HyperTransformer: """Multitable transformer. - Arguments: - metadata(str or dict): Path to the meta.json file or its parsed contents. - dir_name(str): Path to the root directory of meta.json. - missing (bool): Wheter or not to handle missing values before transforming data. - The main propouse of the HyperTransformer class is to easily manage transformations for a whole dataset. + + Args: + metadata (str or dict): + Path to the meta.json file or its parsed contents. + dir_name (str): + Path to the root directory of meta.json. Defaults to ``None``. + missing (bool): + Wheter or not to handle missing values before transforming data. + Defaults to ``True``. + + Raises: + ValueError: + A ``ValueError`` is raised when the ``metadata`` is a ``dict`` and ``dir_name`` + is ``None`` or when the instance of ``metadata`` is not a ``str`` or a ``dict``. """ @staticmethod @@ -36,7 +45,8 @@ def get_class(class_name): """Get class object of transformer from its class name. Args: - class_name(str): Name of the transform. + class_name (str): + Name of the transform. 
Returns: BaseTransformer @@ -48,23 +58,28 @@ def _get_pii_fields(table_metadata): """Return a list of fields marked as sensitive information. Args: - table_metadata (dict): Metadata corresponding to a table. + table_metadata (dict): + Metadata corresponding to a table. Returns: - list[dict]: List of metadata for each field marked as `pii`. + list[dict]: + List of metadata for each field marked as ``pii``. """ return [field for field in table_metadata['fields'] if field.get('pii')] @classmethod def _anonymize_table(cls, table_data, pii_fields): - """Anonymize in `table_data` the fields in `pii_fields`. + """Anonymize in ``table_data`` the fields in ``pii_fields``. Args: - table_data (pandas.DataFrame): Original dataframe/table. - pii_fields (list[dict]): Metadata for the fields to transform. + table_data (pandas.DataFrame): + Original dataframe/table. + pii_fields (list[dict]): + Metadata for the fields to transform. - Result: - pandas.DataFrame: Anonymized table. + Returns: + pandas.DataFrame: + Anonymized table. """ for pii_field in pii_fields: field_name = pii_field['name'] @@ -77,13 +92,15 @@ def _get_tables(self, base_dir): """Load the contents of meta_file and the corresponding data. If fields containing Personally Identifiable Information are detected in the metadata - they are anonymized before asign them into `table_dict`. + they are anonymized before asign them into ``table_dict``. Args: - base_dir(str): Root folder of the dataset files. + base_dir (str): + Root folder of the dataset files. Returns: - dict: Mapping str -> tuple(pandas.DataFrame, dict) + dict: + Mapping str -> tuple(pandas.DataFrame, dict) """ table_dict = {} @@ -102,7 +119,8 @@ def _get_transformers(self): """Load the contents of meta_file and extract information about the transformers. Returns: - dict: tuple(str, str) -> Transformer. + dict: + tuple(str, str) -> Transformer. 
""" transformer_dict = {} @@ -139,18 +157,23 @@ def __init__(self, metadata, dir_name=None, missing=True): self.missing = missing def _fit_transform_column(self, table, metadata, transformer_name, table_name): - """Transform a column from table using transformer and given parameters. + """Transform a column from ``table`` using transformer and given parameters. Args: - table (pandas.DataFrame): Dataframe containing column to transform. - metadata (dict): Metadata for given column. - transformer_name (str): Name of transformer to use on column. - table_name (str): Name of table in original dataset. + table (pandas.DataFrame): + Dataframe containing column to transform. + metadata (dict): + Metadata for given column. + transformer_name (str): + Name of transformer to use on column. + table_name (str): + Name of table in original dataset. Returns: - pandas.DataFrame: Dataframe containing the transformed column. If self.missing=True, - it will contain a second column containing 0 and 1 marking if that - value was originally null or not. + pandas.DataFrame: + Dataframe containing the transformed column. If ``self.missing=True``, + it will contain a second column containing 0 and 1 marking if that + value was originally null or not. """ column_name = metadata['name'] @@ -175,18 +198,22 @@ def _fit_transform_column(self, table, metadata, transformer_name, table_name): return pd.DataFrame(content, columns=columns) def _reverse_transform_column(self, table, metadata, table_name): - """Reverses the transformtion on a column from table using the given parameters. + """Reverses the transformation on a column from ``table`` using the given parameters. Args: - table (pandas.DataFrame): Dataframe containing column to transform. - metadata (dict): Metadata for given column. - table_name (str): Name of table in original dataset. + table (pandas.DataFrame): + Dataframe containing column to transform. + metadata (dict): + Metadata for given column. 
+ table_name (str): + Name of the table in the original dataset. Returns: - pandas.DataFrame: Dataframe containing the transformed column. If self.missing=True, - it will contain a second column containing 0 and 1 marking if that - value was originally null or not. - It will return None in the case the column is not in the table. + pandas.DataFrame: + Dataframe containing the transformed column. If ``self.missing=True``, + it will contain a second column containing 0 and 1 marking if that + value was originally null or not. + It will return ``None`` in the case the column is not in the table. """ column_name = metadata['name'] @@ -206,32 +233,32 @@ def _reverse_transform_column(self, table, metadata, table_name): return content - def fit_transform_table( - self, table, table_meta, transformer_dict=None, transformer_list=None, missing=None): - """Create, apply and store the specified transformers for `table`. + def fit_transform_table(self, table, table_meta, transformer_dict=None, + transformer_list=None, missing=None): + """Create, apply and store the specified transformers for ``table``. Args: - table(pandas.DataFrame): Contents of the table to be transformed. - - table_meta(dict): Metadata for the given table. - - transformer_dict(dict): Mapping `tuple(str, str)` -> `str` where the tuple in the - keys represent the (table_name, column_name) and the value - the name of the assigned transformer. - - transformer_list(list): List of transformers to use. Overrides the transformers in - the meta_file. - - missing(bool): Wheter or not use NullTransformer to handle missing values. + table (pandas.DataFrame): + Contents of the table to be transformed. + table_meta (dict): + Metadata for the given table. + transformer_dict (dict): + Mapping ``tuple(str, str)`` -> ``str`` where the tuple in the keys represent + the ``(table_name, column_name)`` and the value the name + of the assigned transformer. Defaults to ``None``. 
+ transformer_list (list): + List of transformers to use. Overrides the transformers in the ``meta_file``. + Defaults to ``None``. + missing (bool): + Wheter or not use ``NullTransformer`` to handle missing values. + Defaults to ``None`` Returns: - pandas.DataFrame: Transformed table. + pandas.DataFrame: + Transformed table. """ - if missing is None: - missing = self.missing - - else: + if missing is not None: self.missing = missing warnings.warn(DEPRECATION_MESSAGE.format('fit_transform_table'), DeprecationWarning) @@ -259,23 +286,23 @@ def fit_transform_table( return result def transform_table(self, table, table_meta, missing=None): - """Apply the stored transformers to `table`. + """Apply the stored transformers to ``table``. Args: - table(pandas.DataFrame): Contents of the table to be transformed. - - table_meta(dict): Metadata for the given table. - - missing(bool): Wheter or not use NullTransformer to handle missing values. + table (pandas.DataFrame): + Contents of the table to be transformed. + table_meta (dict): + Metadata for the given table. + missing (bool): + Wheter or not use ``NullTransformer`` to handle missing values. + Defaults to ``None`` Returns: - pandas.DataFrame: Transformed table. + pandas.DataFrame: + Transformed table. """ - if missing is None: - missing = self.missing - - else: + if missing is not None: self.missing = missing warnings.warn(DEPRECATION_MESSAGE.format('transform_table'), DeprecationWarning) @@ -304,23 +331,23 @@ def transform_table(self, table, table_meta, missing=None): return pd.DataFrame(content, columns=columns) def reverse_transform_table(self, table, table_meta, missing=None): - """Transform a `table` back to its original format. + """Transform a ``table`` back to its original format. Args: - table(pandas.DataFrame): Contents of the table to be transformed. - - table_meta(dict): Metadata for the given table. - - missing(bool): Wheter or not use NullTransformer to handle missing values. 
+ table (pandas.DataFrame): + Contents of the table to be transformed. + table_meta (dict): + Metadata for the given table. + missing (bool): + Wheter or not use ``NullTransformer`` to handle missing values. + Defaults to ``None`` Returns: - pandas.DataFrame: Table in original format. + pandas.DataFrame: + Table in original format. """ - if missing is None: - missing = self.missing - - else: + if missing is not None: self.missing = missing warnings.warn( DEPRECATION_MESSAGE.format('reverse_transform_table'), DeprecationWarning) @@ -335,32 +362,33 @@ def reverse_transform_table(self, table, table_meta, missing=None): return result - def fit_transform( - self, tables=None, transformer_dict=None, transformer_list=None, missing=None): + def fit_transform(self, tables=None, transformer_dict=None, + transformer_list=None, missing=None): """Create, apply and store the specified transformers for the given tables. Args: - tables(dict): Mapping of table names to `tuple` where each tuple is on the form - (`pandas.DataFrame`, `dict`). The `DataFrame` contains the table data - and the `dict` the corresponding meta information. - If not specified, the tables will be retrieved using the meta_file. - - transformer_dict(dict): Mapping `tuple(str, str)` -> `str` where the tuple is - (table_name, column_name). - - transformer_list(list): List of transformers to use. Overrides the transformers in - the meta_file. - - missing(bool): Wheter or not use NullTransformer to handle missing values. + tables (dict): + Mapping of table names to ``tuple`` where each tuple is on the form + ``(pandas.DataFrame, dict)``. The ``DataFrame`` contains the table data + and the ``dict`` the corresponding meta information. + If not specified, the tables will be retrieved using the meta_file. + Defaults to ``None``. + transformer_dict (dict): + Mapping ``tuple(str, str)`` -> ``str`` where the tuple is + ``(table_name, column_name)``. Defaults to ``None``. 
+ transformer_list (list): + List of transformers to use. Overrides the transformers in the ``meta_file``. + Defaults to ``None``. + missing (bool): + Wheter or not use ``NullTransformer`` to handle missing values. + Defaults to ``None``. Returns: - dict: Map from `str` (table_names) to `pandas.DataFrame` (transformed data). + dict: + Map from ``str`` (table_names) to ``pandas.DataFrame`` (transformed data). """ - if missing is None: - missing = self.missing - - else: + if missing is not None: self.missing = missing warnings.warn(DEPRECATION_MESSAGE.format('fit_transform'), DeprecationWarning) @@ -382,26 +410,26 @@ def fit_transform( return transformed def transform(self, tables, table_metas=None, missing=None): - """Apply all the saved transformers to `tables`. + """Apply all the saved transformers to ``tables``. Args: - tables(dict): mapping of table names to `tuple` where each tuple is on the form - (`pandas.DataFrame`, `dict`). The `DataFrame` contains the table data - and the `dict` the corresponding meta information. - If not specified, the tables will be retrieved using the meta_file. - - table_metas(dict): Full metadata file for the dataset. - - missing(bool): Wheter or not use NullTransformer to handle missing values. + tables(dict): + mapping of table names to ``tuple`` where each tuple is on the form + (``pandas.DataFrame``, ``dict``). The ``DataFrame`` contains the table data + and the ``dict`` the corresponding meta information. + If not specified, the tables will be retrieved using the meta_file. + table_metas(dict): + Full metadata file for the dataset. Defaults to ``None``. + missing(bool): + Wheter or not use ``NullTransformer`` to handle missing values. + Defaults to ``None``. Returns: - dict: Map from `str` (table_names) to `pandas.DataFrame` (transformed data). + dict: + Map from ``str`` (table_names) to ``pandas.DataFrame`` (transformed data). 
""" - if missing is None: - missing = self.missing - - else: + if missing is not None: self.missing = missing warnings.warn(DEPRECATION_MESSAGE.format('transform'), DeprecationWarning) @@ -423,23 +451,23 @@ def reverse_transform(self, tables, table_metas=None, missing=None): """Transform data back to its original format. Args: - tables(dict): mapping of table names to `tuple` where each tuple is on the form - (`pandas.DataFrame`, `dict`). The `DataFrame` contains the transformed - data and the `dict` the corresponding meta information. - If not specified, the tables will be retrieved using the meta_file. - - table_metas(dict): Full metadata file for the dataset. - - missing(bool): Wheter or not use NullTransformer to handle missing values. + tables(dict): + mapping of table names to ``tuple`` where each tuple is on the form + (``pandas.DataFrame``, ``dict``). The ``DataFrame`` contains the transformed + data and the ``dict`` the corresponding meta information. + If not specified, the tables will be retrieved using the meta_file. + table_metas(dict): + Full metadata file for the dataset. Defaults to ``None``. + missing(bool): + Wheter or not use ``NullTransformer`` to handle missing values. + Defaults to ``None`` Returns: - dict: Map from `str` (table_names) to `pandas.DataFrame` (transformed data). + dict: + Map from ``str`` (table_names) to ``pandas.DataFrame`` (transformed data). 
""" - if missing is None: - missing = self.missing - - else: + if missing is not None: self.missing = missing warnings.warn(DEPRECATION_MESSAGE.format('reverse_transform'), DeprecationWarning) diff --git a/rdt/rdt.py b/rdt/rdt.py deleted file mode 100644 index 7fbbae4f9..000000000 --- a/rdt/rdt.py +++ /dev/null @@ -1,3 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Main module.""" diff --git a/rdt/transformers/__init__.py b/rdt/transformers/__init__.py index 5c31e9749..754d26d8e 100644 --- a/rdt/transformers/__init__.py +++ b/rdt/transformers/__init__.py @@ -1,65 +1,13 @@ -import json -import os - -import pandas as pd - from rdt.transformers.base import BaseTransformer from rdt.transformers.category import CatTransformer from rdt.transformers.datetime import DTTransformer from rdt.transformers.null import NullTransformer from rdt.transformers.number import NumberTransformer - -def load_data_table(table_name, meta_file, meta): - """Return the contents and metadata of a given table. - - Args: - table_name(str): Name of the table. - meta_file(str): Path to the meta.json file. - meta(dict): Contents of meta.json. - - Returns: - tuple(pandas.DataFrame, dict) - - """ - for table in meta['tables']: - if table['name'] == table_name: - prefix = os.path.dirname(meta_file) - relative_path = os.path.join(prefix, meta['path'], table['path']) - return pd.read_csv(relative_path), table - - -def get_col_info(table_name, col_name, meta_file): - """Return the content and metadata of a fiven column. - - Args: - table_name(str): Name of the table. - col_name(str): Name of the column. - meta_file(str): Path to the meta.json file. 
- - Returns: - tuple(pandas.Series, dict) - """ - - with open(meta_file, 'r') as f: - meta = json.load(f) - - data_table, table = load_data_table(table_name, meta_file, meta) - - for field in table['fields']: - if field['name'] == col_name: - col_meta = field - - col = data_table[col_name] - - return (col, col_meta) - - __all__ = [ 'BaseTransformer', 'CatTransformer', 'DTTransformer', 'NullTransformer', 'NumberTransformer', - 'get_col_info' ] diff --git a/rdt/transformers/base.py b/rdt/transformers/base.py index c2a173515..cb83c9489 100644 --- a/rdt/transformers/base.py +++ b/rdt/transformers/base.py @@ -7,8 +7,10 @@ def __init__(self, column_metadata): """Initialize preprocessor. Args: - column_metadata(dict): Meta information of the column. - transformer_type(str): Type of data the transformer is able to transform. + column_metadata (dict): + Meta information of the column. + transformer_type (str): + Type of data the transformer is able to transform. """ self.column_metadata = column_metadata self.col_name = column_metadata['name'] @@ -19,7 +21,8 @@ def fit(self, col): """Prepare the transformer to convert data. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. """ raise NotImplementedError @@ -27,7 +30,8 @@ def transform(self, col): """Does the required transformations to the data. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame @@ -38,7 +42,8 @@ def fit_transform(self, col): """Prepare the transformer to convert data and return the processed table. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame @@ -50,7 +55,8 @@ def reverse_transform(self, col): """Converts data back into original format. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. 
Returns: pandas.DataFrame @@ -61,9 +67,12 @@ def check_data_type(self): """Check the type of the transformer and column match. Args: - column_metadata(dict): Metadata of the column. + column_metadata (dict): + Metadata of the column. - Raises a ValueError if the types don't match + Raises: + ValueError: + A ``ValueError`` is raised if the types don't match. """ metadata_type = self.column_metadata.get('type') if self.type != metadata_type and metadata_type not in self.type: diff --git a/rdt/transformers/category.py b/rdt/transformers/category.py index 50d53b11d..aadf4fc61 100644 --- a/rdt/transformers/category.py +++ b/rdt/transformers/category.py @@ -9,28 +9,32 @@ class CatTransformer(BaseTransformer): """Transformer for categorical data. - Args: - column_metadata(dict): Meta information of the column. - anonymize (bool): Wheter or not replace the values of col before generating the - categorical_map. - - category (str): The type of data to ask faker for when anonimizing. - - This transformer expects a `column` pandas.Series of any dtype in a pandas.DataFrame `table`. - On transform, it will map categorical values into the interval [0, 1], back and forth mapping - all the unique values close to their frequency in the fit data. - This mean, that two instances of the same category may not be transformed into the same number. - - On reverse_transform it will transform any value close to the frenquency to their related - category. This behavior is to allow the transformed data to be modelled and the sampled - data to be reverse_transformed. + This transformer expects a ``column`` ``pandas.Series`` of any dtype in + a ``pandas.DataFrame`` table. On transform, it will map categorical values + into the interval [0, 1], back and forth mapping all the unique values close + to their frequency in the fit data. 
- Please note the following behavior, for any column: - - >>> result = transformer.fit_transform(column) - >>> assert result[0 <= result <= 1].all() + This means that two instances of the same category may not be transformed into + the same number. + On ``reverse_transform`` it will transform any value close to the frenquency + to their related category. This behavior is to allow the transformed data to be + modelled and the sampled data to be ``reverse_transformed``. + Args: + column_metadata (dict): + Meta information of the column. + anonymize (bool): + Wheter or not replace the values of col before generating the + ``categorical_map``. + category (str): + The type of data to ask faker for when anonimizing. + + Example: + Please note the following behavior, for any column: + + >>> result = transformer.fit_transform(column) + >>> assert result[0 <= result <= 1].all() """ type = 'categorical' @@ -67,20 +71,23 @@ def get_generator(self): def anonymize_column(self, col): """Map the values of column to new ones of the same type. - It replaces the values from others generated using `faker`. It will however, - keep the original distribution. That mean that the generated `probability_map` for both - will have the same values, but different keys. + It replaces the values from others generated using ``faker``. However, + it will keep the original distribution. That means that the generated + ``probability_map`` for both will have the same values, but different keys. Args: - col (pandas.DataFrame): Dataframe containing the column to anonymize. + col (pandas.DataFrame): + Dataframe containing the column to anonymize. Returns: - pd.DataFrame: DataFrame with its values mapped to new ones, - keeping the original distribution. + pandas.DataFrame: + DataFrame with it's values mapped to new ones, keeping the original + distribution. Raises: - ValueError: A `ValueError` is raised if faker is not able to provide enought - different values. 
+ ValueError: + A ``ValueError`` is raised if ``faker`` is not able to provide enought + different values. """ column = col[self.col_name] @@ -103,15 +110,17 @@ def anonymize_column(self, col): def fit(self, col): """Prepare the transfomer to process data. - This method can only be used if `anonymize` is False. - Otherwise, please use `fit_transform`. + This method can only be used if ``anonymize`` is False. + Otherwise, please use ``fit_transform``. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Raises: - ValueError: If this method is called and `anonymize` is True. - + ValueError: + A ``ValueError`` is raised if this method is called and + ``self.anonymize`` is True. """ if self.anonymize: @@ -125,7 +134,8 @@ def _fit(self, col): """Create a map of the empirical probability for each category. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. """ column = col[self.col_name].replace({np.nan: np.inf}) @@ -148,7 +158,8 @@ def transform(self, col): """Prepare the transformer to convert data and return the processed table. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame @@ -166,7 +177,8 @@ def fit_transform(self, col): """Prepare the transformer and return processed data. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame @@ -182,7 +194,8 @@ def reverse_transform(self, col): """Converts data back into original format. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. 
Returns: pandas.DataFrame @@ -203,7 +216,8 @@ def get_category(self, column): """Returns categories for the specified numeric values Args: - column(pandas.Series): Values to transform into categories + column (pandas.Series): + Values to transform into categories Returns: pandas.Series diff --git a/rdt/transformers/datetime.py b/rdt/transformers/datetime.py index 60aeb7623..be7f89e2b 100644 --- a/rdt/transformers/datetime.py +++ b/rdt/transformers/datetime.py @@ -24,10 +24,8 @@ def fit(self, col): """Prepare the transformer to convert data. Args: - col(pandas.DataFrame): Data to transform. - - Returns: - None + col (pandas.DataFrame): + Data to transform. """ dates = self.safe_datetime_cast(col) self.default_val = dates.groupby(dates).count().index[0].timestamp() * 1e9 @@ -36,7 +34,8 @@ def transform(self, col): """Prepare the transformer to convert data and return the processed table. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame @@ -54,7 +53,8 @@ def reverse_transform(self, col): """Converts data back into original format. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame @@ -71,7 +71,8 @@ def safe_datetime_cast(self, col): """Parses string values into datetime. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.Series @@ -90,7 +91,8 @@ def to_timestamp(self, data): """Transform a datetime series into linux epoch. Args: - data(pandas.DataFrame): DataFrame containins a column named as `self.col_name`. + data (pandas.DataFrame): + DataFrame contains a column named as ``self.col_name``. Returns: pandas.Series @@ -102,10 +104,11 @@ def to_timestamp(self, data): return result def safe_date(self, x): - """Transform x[self.col_name] into a date string. + """Transform ``x[self.col_name]`` into a date string.
Args: - x(dict like / pandas.Series): Row containing data to cast safely. + x (dict-like or pandas.Series): + Row containing data to cast safely. Returns: str diff --git a/rdt/transformers/null.py b/rdt/transformers/null.py index f480dee2d..61d6452d8 100644 --- a/rdt/transformers/null.py +++ b/rdt/transformers/null.py @@ -34,7 +34,8 @@ def transform(self, col): """Prepare the transformer to convert data and return the processed table. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame @@ -48,7 +49,8 @@ def reverse_transform(self, col): """Converts data back into original format. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame diff --git a/rdt/transformers/number.py b/rdt/transformers/number.py index cd20fb5c1..2916ffa74 100644 --- a/rdt/transformers/number.py +++ b/rdt/transformers/number.py @@ -21,7 +21,8 @@ def fit(self, col): """Sets the default value. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame @@ -32,7 +33,8 @@ def transform(self, col): """Prepare the transformer to convert data and return the processed table. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame @@ -50,7 +52,8 @@ def reverse_transform(self, col): """Converts data back into original format. Args: - col(pandas.DataFrame): Data to transform. + col (pandas.DataFrame): + Data to transform. Returns: pandas.DataFrame @@ -65,7 +68,6 @@ def reverse_transform(self, col): return output def get_default_value(self, data): - """ """ col = data[self.col_name] uniques = col[~col.isnull()].unique() if not len(uniques): @@ -97,11 +99,13 @@ def safe_round(self, x): """Returns a converter that takes in a value and turns it into an integer, if necessary. Args: - col_name(str): Name of the column. 
- subtype(str): Numeric subtype of the values. + col_name (str): + Name of the column. + subtype (str): + Numeric subtype of the values. Returns: - function + int """ val = x[self.col_name] diff --git a/rdt/transformers/positive_number.py b/rdt/transformers/positive_number.py index a6451af95..8125ff083 100644 --- a/rdt/transformers/positive_number.py +++ b/rdt/transformers/positive_number.py @@ -15,10 +15,11 @@ def transform(self, column): """Applies an exponential to values to turn them positive numbers. Args: - column (pandas.DataFrame): Data to transform. + column (pandas.DataFrame): + Data to transform. Returns: - pd.DataFrame + pandas.DataFrame """ self.check_data_type() @@ -28,10 +29,11 @@ def reverse_transform(self, column): """Applies the natural logarithm function to turn positive values into real ranged values. Args: - column (pandas.DataFrame): Data to transform. + column (pandas.DataFrame): + Data to transform. Returns: - pd.DataFrame + pandas.DataFrame """ self.check_data_type() diff --git a/setup.cfg b/setup.cfg index 3fa89c1fb..3a9d3772e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.2 +current_version = 0.1.3-dev commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))? 
diff --git a/setup.py b/setup.py index 55e7b42d3..4d379980b 100644 --- a/setup.py +++ b/setup.py @@ -26,16 +26,31 @@ ] development_requires = [ + # general 'bumpversion>=0.5.3', - 'Sphinx>=1.7.1', 'recommonmark>=0.4.0', + + # docs + 'm2r>=0.2.0', + 'Sphinx>=1.7.1', 'sphinx_rtd_theme>=0.2.4', + 'autodocsumm>=0.1.10', + + # style check 'flake8>=3.5.0', 'isort>=4.3.4', + + # fix style issues 'autoflake>=1.1', 'autopep8>=1.3.5', + + # distribute on Pypi 'twine>=1.10.0', 'wheel>=0.30.0', + + # advanced testing + 'coverage>=4.5.1', + 'tox>=2.9.1', ] setup( @@ -69,6 +84,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-Project/RDT', - version='0.1.2', + version='0.1.3-dev', zip_safe=False, ) diff --git a/tox.ini b/tox.ini index b3e196741..69582ad7a 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ setenv = PYTHONPATH = {toxinidir} extras = test commands = - /usr/bin/env python -m pytest --cov=rdt + /usr/bin/env make test [testenv:lint]