From 83d1535ce63c4b9fe2500d9ee405e0d76f15d363 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 19 Oct 2018 19:35:22 -0400 Subject: [PATCH] First commit --- .editorconfig | 24 ++ .github/ISSUE_TEMPLATE.md | 15 ++ .gitignore | 108 ++++++++ .travis.yml | 31 +++ AUTHORS.rst | 13 + CONTRIBUTING.rst | 206 ++++++++++++++++ HISTORY.md | 5 + LICENSE | 22 ++ MANIFEST.in | 11 + Makefile | 197 +++++++++++++++ README.md | 10 + docs/Makefile | 20 ++ docs/authors.rst | 1 + docs/conf.py | 207 ++++++++++++++++ docs/contributing.rst | 1 + docs/history.rst | 1 + docs/index.rst | 30 +++ docs/installation.rst | 51 ++++ docs/make.bat | 36 +++ docs/usage.rst | 9 + mit_d3m/__init__.py | 73 ++++++ mit_d3m/config.py | 97 ++++++++ mit_d3m/dataset.py | 326 ++++++++++++++++++++++++ mit_d3m/db.py | 39 +++ mit_d3m/loaders.py | 505 ++++++++++++++++++++++++++++++++++++++ mit_d3m/metrics.py | 97 ++++++++ mit_d3m/stats.py | 114 +++++++++ mit_d3m/utils.py | 78 ++++++ setup.cfg | 44 ++++ setup.py | 96 ++++++++ tests/test_mit_d3m.py | 24 ++ tox.ini | 29 +++ 32 files changed, 2520 insertions(+) create mode 100644 .editorconfig create mode 100644 .github/ISSUE_TEMPLATE.md create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 AUTHORS.rst create mode 100644 CONTRIBUTING.rst create mode 100644 HISTORY.md create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 Makefile create mode 100644 README.md create mode 100644 docs/Makefile create mode 100644 docs/authors.rst create mode 100644 docs/conf.py create mode 100644 docs/contributing.rst create mode 100644 docs/history.rst create mode 100644 docs/index.rst create mode 100644 docs/installation.rst create mode 100644 docs/make.bat create mode 100644 docs/usage.rst create mode 100644 mit_d3m/__init__.py create mode 100644 mit_d3m/config.py create mode 100644 mit_d3m/dataset.py create mode 100644 mit_d3m/db.py create mode 100644 mit_d3m/loaders.py create mode 100644 mit_d3m/metrics.py create mode 100644 mit_d3m/stats.py create mode 100644 mit_d3m/utils.py create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 tests/test_mit_d3m.py create mode 100644 tox.ini diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..aa6cf4e --- /dev/null +++ b/.editorconfig @@ -0,0 +1,24 @@ +# http://editorconfig.org + +root = true + +[*] +indent_style = space +indent_size = 4 +trim_trailing_whitespace = true +insert_final_newline = true +charset = utf-8 +end_of_line = lf + +[*.py] +max_line_length = 79 + +[*.bat] +indent_style = tab +end_of_line = crlf + +[LICENSE] +insert_final_newline = false + +[Makefile] +indent_style = tab diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..bcbdc85 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,15 @@ +* mit-d3m version: +* Python version: +* Operating System: + +### Description + +Describe what you were trying to get done. +Tell us what happened, what went wrong, and what you expected to happen. + +### What I Did + +``` +Paste the command(s) you ran and the output. +If there was a crash, please include the traceback here. 
+``` diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..442805b --- /dev/null +++ b/.gitignore @@ -0,0 +1,108 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# vim +.*.swp + +data +docs/api diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..5cb49ee --- /dev/null +++ b/.travis.yml @@ -0,0 +1,31 @@ +# Config file for automatic testing at travis-ci.org +language: python +python: + - 3.6 + - 3.5 + - 3.4 + +# Command to install dependencies +install: pip install -U tox-travis + +# Command to run tests +script: tox + +deploy: + + + # Automatically build and deploy documentation to GitHub Pages after every + # commit + # Follow the instructions at https://docs.travis-ci.com/user/deployment/pages/ + # to setup a personal deployment token and then provide it as a secure + # environment variable at https://travis-ci.org/HDI-Project/mit-d3m/settings + - provider: pages + skip-cleanup: true + github-token: "$GITHUB_TOKEN" + keep-history: true + local-dir: docs/_build/html + target-branch: gh-pages + on: + branch: master + python: 3.6 + diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 0000000..1f40289 --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,13 @@ +======= +Credits +======= + +Development Lead +---------------- + +* MIT Data To AI Lab + +Contributors +------------ + +None yet. Why not be the first? diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst new file mode 100644 index 0000000..764001f --- /dev/null +++ b/CONTRIBUTING.rst @@ -0,0 +1,206 @@ +.. highlight:: shell + +============ +Contributing +============ + +Contributions are welcome, and they are greatly appreciated! Every little bit +helps, and credit will always be given. + +You can contribute in many ways: + +Types of Contributions +---------------------- + +Report Bugs +~~~~~~~~~~~ + +Report bugs at the `GitHub Issues page`_. + +If you are reporting a bug, please include: + +* Your operating system name and version. +* Any details about your local setup that might be helpful in troubleshooting. +* Detailed steps to reproduce the bug. + +Fix Bugs +~~~~~~~~ + +Look through the GitHub issues for bugs. Anything tagged with "bug" and "help +wanted" is open to whoever wants to implement it. + +Implement Features +~~~~~~~~~~~~~~~~~~ + +Look through the GitHub issues for features. 
Anything tagged with "enhancement" +and "help wanted" is open to whoever wants to implement it. + +Write Documentation +~~~~~~~~~~~~~~~~~~~ + +mit-d3m could always use more documentation, whether as part of the +official mit-d3m docs, in docstrings, or even on the web in blog posts, +articles, and such. + +Submit Feedback +~~~~~~~~~~~~~~~ + +The best way to send feedback is to file an issue at the `GitHub Issues page`_. + +If you are proposing a feature: + +* Explain in detail how it would work. +* Keep the scope as narrow as possible, to make it easier to implement. +* Remember that this is a volunteer-driven project, and that contributions + are welcome :) + +Get Started! +------------ + +Ready to contribute? Here's how to set up `mit-d3m` for local development. + +1. Fork the `mit-d3m` repo on GitHub. +2. Clone your fork locally:: + + $ git clone git@github.com:your_name_here/mit-d3m.git + +3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, + this is how you set up your fork for local development:: + + $ mkvirtualenv mit-d3m + $ cd mit-d3m/ + $ make install-develop + +4. Create a branch for local development:: + + $ git checkout -b name-of-your-bugfix-or-feature + + Try to use the naming scheme of prefixing your branch with ``gh-X`` where X is + the associated issue, such as ``gh-3-fix-foo-bug``. And if you are not + developing on your own fork, further prefix the branch with your GitHub + username, like ``githubusername/gh-3-fix-foo-bug``. + + Now you can make your changes locally. + +5. While working on your changes, make sure to cover all your developments with the required + unit tests, and that none of the old tests fail as a consequence of your changes. + For this, make sure to run the test suite and check the code coverage:: + + $ make lint # Check code styling + $ make test # Run the tests + $ make coverage # Get the coverage report + +6. When you're done making changes, check that your changes pass all the styling checks and + tests, including the other supported Python versions, using:: + + $ make test-all + +7. Also make sure to include the necessary documentation in the code as docstrings following + the `Google docstrings style`_. + If you want to see how your documentation will look when it is published, you can + generate and view the docs with this command:: + + $ make view-docs + +8. Commit your changes and push your branch to GitHub:: + + $ git add . + $ git commit -m "Your detailed description of your changes." + $ git push origin name-of-your-bugfix-or-feature + +9. Submit a pull request through the GitHub website. + +Pull Request Guidelines +----------------------- + +Before you submit a pull request, check that it meets these guidelines: + +1. It resolves an open GitHub Issue and contains its reference in the title or + the description. If there is no associated issue, feel free to create one. +2. Whenever possible, it resolves only **one** issue. If your PR resolves more than + one issue, try to split it into more than one pull request. +3. The pull request should include unit tests that cover all the changed code. +4. If the pull request adds functionality, the docs should be updated. Put + your new functionality into a function with a docstring, and add the + feature to the documentation in an appropriate place. +5. The pull request should work for all the supported Python versions. Check the `Travis Build + Status page`_ and make sure that all the checks pass.
+ +Unit Testing Guidelines +----------------------- + +All the Unit Tests should comply with the following requirements: + +1. Unit Tests should be based only on the unittest and pytest modules. + +2. The tests that cover a module called ``mit_d3m/path/to/a_module.py`` + should be implemented in a separate module called + ``tests/mit_d3m/path/to/test_a_module.py``. + Note that the module name has the ``test_`` prefix and is located in a path similar + to the one of the tested module, just inside the ``tests`` folder. + +3. Each method of the tested module should have at least one associated test method, and + each test method should cover only **one** use case or scenario. + +4. Test case methods should start with the ``test_`` prefix and have descriptive names + that indicate which scenario they cover. + Names such as ``test_some_method_input_none``, ``test_some_method_value_error`` or + ``test_some_method_timeout`` are correct, but names like ``test_some_method_1``, + ``some_method`` or ``test_error`` are not. + +5. Each test should validate only what the code of the method being tested does, and not + cover the behavior of any third party package or tool being used, which is assumed to + work properly as long as it is passed the right values. + +6. Any third party tool that may have any kind of random behavior, such as some Machine + Learning models, databases or Web APIs, will be mocked using the ``mock`` library, and + the only thing that will be tested is that our code passes the right values to them. + +7. Unit tests should not use anything from outside the test and the code being tested. This + includes not reading or writing to any file system or database, which will be properly + mocked. + +An illustrative test that follows these guidelines is sketched below. + +Tips +---- + +To run a subset of tests:: + + $ python -m pytest tests/test_mit_d3m.py + $ python -m pytest -k 'foo' + +Release Workflow +---------------- + +The process of releasing a new version involves several steps combining both ``git`` and +``bumpversion`` which, briefly, are: + +1. Merge what is in the ``master`` branch into the ``stable`` branch. +2. Update the version in ``setup.cfg``, ``mit_d3m/__init__.py`` and + ``HISTORY.md`` files. +3. Create a new git tag pointing at the corresponding commit in the ``stable`` branch. +4. Merge the new commit from ``stable`` into ``master``. +5. Update the version in ``setup.cfg`` and ``mit_d3m/__init__.py`` + to open the next development iteration. + +.. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new + entry that explains the changes that will be included in the new version. + Normally this is just a list of the Pull Requests that have been merged to master + since the last release. + +Once this is done, run one of the following commands: + +1. If you are releasing a patch version:: + + make release + +2. If you are releasing a minor version:: + + make release-minor + +3. If you are releasing a major version:: + + make release-major + +.. _GitHub Issues page: https://github.com/HDI-Project/mit-d3m/issues +.. _Travis Build Status page: https://travis-ci.org/HDI-Project/mit-d3m/pull_requests +.. _Google docstrings style: https://google.github.io/styleguide/pyguide.html?showone=Comments#Comments diff --git a/HISTORY.md b/HISTORY.md new file mode 100644 index 0000000..b43c2f5 --- /dev/null +++ b/HISTORY.md @@ -0,0 +1,5 @@ +# History + +## 0.1.0 + +* First release on PyPI.
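As an illustration of the Unit Testing Guidelines above, here is a minimal sketch of what such a test could look like. It targets the ``get_db`` function from ``mit_d3m/db.py`` (added later in this patch); the module path ``tests/mit_d3m/test_db.py`` and the test name are hypothetical and not part of this commit.

```python
# tests/mit_d3m/test_db.py -- hypothetical example following the guidelines:
# unittest/pytest only, one scenario per test, a descriptive test name, and the
# third party service (MongoDB) mocked so nothing outside the test is touched.
from unittest.mock import patch

from mit_d3m.db import get_db


@patch('mit_d3m.db.MongoClient')
def test_get_db_defaults(mongo_client_mock):
    """get_db called without arguments connects to localhost and returns the 'test' database."""
    db = get_db()

    # Our code should pass the default connection values to MongoClient ...
    mongo_client_mock.assert_called_once_with(
        host='localhost',
        port=27017,
        username=None,
        password=None,
        authSource='admin',
    )
    # ... and return the 'test' database from the mocked client.
    assert db is mongo_client_mock.return_value['test']
```

Because ``MongoClient`` is mocked, the test never opens a real database connection, in line with guidelines 6 and 7.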
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bdde18d --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2018, MIT Data To AI Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..469520f --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,11 @@ +include AUTHORS.rst +include CONTRIBUTING.rst +include HISTORY.md +include LICENSE +include README.md + +recursive-include tests * +recursive-exclude * __pycache__ +recursive-exclude * *.py[co] + +recursive-include docs *.md *.rst conf.py Makefile make.bat *.jpg *.png *.gif diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4b6fb8f --- /dev/null +++ b/Makefile @@ -0,0 +1,197 @@ +.DEFAULT_GOAL := help + +define BROWSER_PYSCRIPT +import os, webbrowser, sys + +try: + from urllib import pathname2url +except: + from urllib.request import pathname2url + +webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) +endef +export BROWSER_PYSCRIPT + +define PRINT_HELP_PYSCRIPT +import re, sys + +for line in sys.stdin: + match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) + if match: + target, help = match.groups() + print("%-20s %s" % (target, help)) +endef +export PRINT_HELP_PYSCRIPT + +BROWSER := python -c "$$BROWSER_PYSCRIPT" + +.PHONY: help +help: + @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) + + +# CLEAN TARGETS + +.PHONY: clean-build +clean-build: ## remove build artifacts + rm -fr build/ + rm -fr dist/ + rm -fr .eggs/ + find . -name '*.egg-info' -exec rm -fr {} + + find . -name '*.egg' -exec rm -f {} + + +.PHONY: clean-pyc +clean-pyc: ## remove Python file artifacts + find . -name '*.pyc' -exec rm -f {} + + find . -name '*.pyo' -exec rm -f {} + + find . -name '*~' -exec rm -f {} + + find . -name '__pycache__' -exec rm -fr {} + + +.PHONY: clean-docs +clean-docs: ## remove previously built docs + rm -f docs/api/*.rst + -$(MAKE) -C docs clean 2>/dev/null # this fails if sphinx is not yet installed + +.PHONY: clean-coverage +clean-coverage: ## remove coverage artifacts + rm -f .coverage + rm -f .coverage.* + rm -fr htmlcov/ + +.PHONY: clean-test +clean-test: ## remove test artifacts + rm -fr .tox/ + rm -fr .pytest_cache + +.PHONY: clean +clean: clean-build clean-pyc clean-test clean-coverage clean-docs ## remove all build, test, coverage, docs and Python artifacts + + +# INSTALL TARGETS + +.PHONY: install +install: clean-build clean-pyc ## install the package to the active Python's site-packages + pip install . 
+ +.PHONY: install-test +install-test: clean-build clean-pyc ## install the package and test dependencies + pip install .[test] + +.PHONY: install-develop +install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development + pip install -e .[dev] + + +# LINT TARGETS + +.PHONY: lint +lint: ## check style with flake8 and isort + flake8 mit_d3m tests + isort -c --recursive mit_d3m tests + +.PHONY: fix-lint +fix-lint: ## fix lint issues using autoflake, autopep8, and isort + find mit_d3m -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables + autopep8 --in-place --recursive --aggressive mit_d3m + isort --apply --atomic --recursive mit_d3m + + find tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables + autopep8 --in-place --recursive --aggressive tests + isort --apply --atomic --recursive tests + + +# TEST TARGETS + +.PHONY: test +test: ## run tests quickly with the default Python + python -m pytest + +.PHONY: test-all +test-all: ## run tests on every Python version with tox + tox + +.PHONY: coverage +coverage: ## check code coverage quickly with the default Python + coverage run --source mit_d3m -m pytest + coverage report -m + coverage html + $(BROWSER) htmlcov/index.html + + +# DOCS TARGETS + +.PHONY: docs +docs: clean-docs ## generate Sphinx HTML documentation, including API docs + sphinx-apidoc --module-first --separate -T -o docs/api/ mit_d3m + $(MAKE) -C docs html + touch docs/_build/html/.nojekyll + +.PHONY: view-docs +view-docs: docs ## view docs in browser + $(BROWSER) docs/_build/html/index.html + +.PHONY: serve-docs +serve-docs: view-docs ## compile the docs watching for changes + watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' . 
+ + +# RELEASE TARGETS + +.PHONY: dist +dist: clean ## builds source and wheel package + python setup.py sdist + python setup.py bdist_wheel + ls -l dist + +.PHONY: test-publish +test-publish: dist ## package and upload a release on TestPyPI + twine upload --repository-url https://test.pypi.org/legacy/ dist/* + +.PHONY: publish +publish: dist ## package and upload a release + twine upload dist/* + +.PHONY: bumpversion-release +bumpversion-release: ## Merge master to stable and bumpversion release + git checkout stable + git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" + bumpversion release + git push --tags origin stable + +.PHONY: bumpversion-patch +bumpversion-patch: ## Merge stable to master and bumpversion patch + git checkout master + git merge stable + bumpversion --no-tag patch + git push + +.PHONY: bumpversion-minor +bumpversion-minor: ## Bump the version the next minor skipping the release + bumpversion --no-tag minor + +.PHONY: bumpversion-major +bumpversion-major: ## Bump the version the next major skipping the release + bumpversion --no-tag major + +CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD) +CHANGELOG_LINES := $(shell git diff HEAD..stable HISTORY.md | wc -l) + +.PHONY: check-release +check-release: ## Check if the release can be made +ifneq ($(CURRENT_BRANCH),master) + $(error Please make the release from master branch\n) +endif +ifeq ($(CHANGELOG_LINES),0) + $(error Please insert the release notes in HISTORY.md before releasing) +else + @echo "A new release can be made" +endif + +.PHONY: release +release: check-release bumpversion-release publish bumpversion-patch + +.PHONY: release-minor +release-minor: check-release bumpversion-minor release + +.PHONY: release-major +release-major: check-release bumpversion-major release diff --git a/README.md b/README.md new file mode 100644 index 0000000..1e7c6f9 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +[![PyPI Shield](https://img.shields.io/pypi/v/mit-d3m.svg)](https://pypi.python.org/pypi/mit-d3m) +[![Travis CI Shield](https://travis-ci.org/HDI-Project/mit-d3m.svg?branch=master)](https://travis-ci.org/HDI-Project/mit-d3m) + +# mit-d3m + +MIT tools to work with D3M datasets. + +- Free software: MIT license +- Documentation: https://HDI-Project.github.io/mit-d3m +- Homepage: https://github.com/HDI-Project/mit-d3m diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..4132b36 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = python -msphinx +SPHINXPROJ = mit-d3m +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/authors.rst b/docs/authors.rst new file mode 100644 index 0000000..e122f91 --- /dev/null +++ b/docs/authors.rst @@ -0,0 +1 @@ +.. 
include:: ../AUTHORS.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..93647d1 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# MLBlocks documentation build configuration file, created by +# sphinx-quickstart on Fri Jun 9 13:47:02 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another +# directory, add these directories to sys.path here. If the directory is +# relative to the documentation root, use os.path.abspath to make it +# absolute, like shown here. + +# import os +# import sys + +import sphinx_rtd_theme # For read the docs theme +# from recommonmark.parser import CommonMarkParser +# from recommonmark.transform import AutoStructify + +# sys.path.insert(0, os.path.abspath('..')) + +import mit_d3m + +# -- General configuration --------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [ + 'm2r', + 'sphinx.ext.autodoc', + 'sphinx.ext.githubpages', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', + # 'sphinx.ext.graphviz', + # 'IPython.sphinxext.ipython_console_highlighting', + # 'IPython.sphinxext.ipython_directive', +] + +# ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +source_suffix = ['.rst', '.md'] #, '.ipynb'] + +# source_parsers = { +# '.md': CommonMarkParser, +# } + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'mit-d3m' +slug = 'mit_d3m' +title = project + ' Documentation', +copyright = '2018, MIT Data To AI Lab' +author = 'MIT Data To AI Lab' +description = 'Pipelines and Primitives for Machine Learning and Data Science.' +user = 'HDI-Project' + +# The version info for the project you're documenting, acts as replacement +# for |version| and |release|, also used in various other places throughout +# the built documents. +# +# The short X.Y version. +version = mit_d3m.__version__ +# The full version, including alpha/beta/rc tags. +release = mit_d3m.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['.py', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. 
+todo_include_todos = False + +# -- Options for HTML output ------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Readthedocs additions +html_context = { + 'display_github': True, + 'github_user': user, + 'github_repo': project, + 'github_version': 'master', + 'conf_py_path': '/docs/', +} + +# Theme options are theme-specific and customize the look and feel of a +# theme further. For a list of options available for each theme, see the +# documentation. +html_theme_options = { + 'collapse_navigation': False, + 'display_version': False, +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# html_favicon = 'images/favicon.ico' + +# If given, this must be the name of an image file (path relative to the +# configuration directory) that is the logo of the docs. It is placed at +# the top of the sidebar; its width should therefore not exceed 200 pixels. +# html_logo = 'images/mlblocks-logo-small.png' + +# -- Options for HTMLHelp output --------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = slug + 'doc' + + +# -- Options for LaTeX output ------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass +# [howto, manual, or own class]). +latex_documents = [( + master_doc, + slug + '.tex', + title, + author, + 'manual' +)] + + +# -- Options for manual page output ------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [( + master_doc, + slug, + title, + [author], + 1 +)] + + +# -- Options for Texinfo output ---------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [( + master_doc, + slug, + title, + author, + slug, + description, + 'Miscellaneous' +)] diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 0000000..e582053 --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1 @@ +.. include:: ../CONTRIBUTING.rst diff --git a/docs/history.rst b/docs/history.rst new file mode 100644 index 0000000..d26e5be --- /dev/null +++ b/docs/history.rst @@ -0,0 +1 @@ +.. mdinclude:: ../HISTORY.md diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..9d6f467 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,30 @@ +.. mdinclude:: ../README.md + +.. toctree:: + :hidden: + :titlesonly: + + Overview + installation + usage + +.. 
toctree:: + :caption: Advanced Usage + :hidden: + + API Reference + +.. toctree:: + :caption: Development Notes + :hidden: + + contributing + authors + history + + +Indices and tables +================== +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..b12217e --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,51 @@ +.. highlight:: shell + +============ +Installation +============ + + +Stable release +-------------- + +To install mit-d3m, run this command in your terminal: + +.. code-block:: console + + $ pip install mit-d3m + +This is the preferred method to install mit-d3m, as it will always install the most recent stable release. + +If you don't have `pip`_ installed, this `Python installation guide`_ can guide +you through the process. + +.. _pip: https://pip.pypa.io +.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ + + +From source +------------ + +The source for mit-d3m can be downloaded from the `Github repo`_. + +You can either clone the public repository: + +.. code-block:: console + + $ git clone git://github.com/HDI-Project/mit-d3m + +Or download the `tarball`_: + +.. code-block:: console + + $ curl -OL https://github.com/HDI-Project/mit-d3m/tarball/master + +Once you have a copy of the source, you can install it with: + +.. code-block:: console + + $ make install + + +.. _Github repo: https://github.com/HDI-Project/mit-d3m +.. _tarball: https://github.com/HDI-Project/mit-d3m/tarball/master diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..57e5ebe --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=python -msphinx +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=mit-d3m + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The Sphinx module was not found. Make sure you have Sphinx installed, + echo.then set the SPHINXBUILD environment variable to point to the full + echo.path of the 'sphinx-build' executable. Alternatively you may add the + echo.Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/usage.rst b/docs/usage.rst new file mode 100644 index 0000000..d1dfbf6 --- /dev/null +++ b/docs/usage.rst @@ -0,0 +1,9 @@ +===== +Usage +===== + +To use mit-d3m in a project: + +.. 
code-block:: python + + import mit_d3m diff --git a/mit_d3m/__init__.py b/mit_d3m/__init__.py new file mode 100644 index 0000000..14c009b --- /dev/null +++ b/mit_d3m/__init__.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- + +"""Top-level package for mit-d3m.""" + +__author__ = """MIT Data To AI Lab""" +__email__ = 'dailabmit@gmail.com' +__version__ = '0.1.0-dev' + +import os +import shutil +import tarfile + +import boto3 + +from mit_d3m.dataset import D3MDS +from mit_d3m.loaders import get_loader +from mit_d3m.metrics import METRICS_DICT + +DATA_PATH = 'data' +BUCKET = 'd3m-data-dai' + + +def download_dataset(bucket, dataset, root_dir): + client = boto3.client('s3') + + print("Downloading dataset {}".format(dataset)) + + key = 'datasets/' + dataset + '.tar.gz' + filename = root_dir + '.tar.gz' + + print("Getting file {} from S3 bucket {}".format(key, bucket)) + client.download_file(Bucket=bucket, Key=key, Filename=filename) + + shutil.rmtree(root_dir, ignore_errors=True) + + print("Extracting {}".format(filename)) + with tarfile.open(filename, 'r:gz') as tf: + tf.extractall(os.path.dirname(root_dir)) + + +def load_d3mds(dataset, force_download=False): + if dataset.endswith('_dataset_TRAIN'): + dataset = dataset[:-14] + + root_dir = os.path.join(DATA_PATH, dataset) + + if force_download or not os.path.exists(root_dir): + if not os.path.exists(root_dir): + os.makedirs(root_dir) + + download_dataset(BUCKET, dataset, root_dir) + + phase_root = os.path.join(root_dir, 'TRAIN') + dataset_path = os.path.join(phase_root, 'dataset_TRAIN') + problem_path = os.path.join(phase_root, 'problem_TRAIN') + + return D3MDS(dataset=dataset_path, problem=problem_path) + + +def load_dataset(dataset, force_download=False): + + d3mds = load_d3mds(dataset, force_download) + + loader = get_loader( + d3mds.get_data_modality(), + d3mds.get_task_type() + ) + + dataset = loader.load(d3mds) + + dataset.scorer = METRICS_DICT[d3mds.get_metric()] + + return dataset diff --git a/mit_d3m/config.py b/mit_d3m/config.py new file mode 100644 index 0000000..3575080 --- /dev/null +++ b/mit_d3m/config.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- + +import argparse +import json +import os + +from mit_d3m.utils import make_abs + + +def build_config(dataset, datasets_dir, phase, problem=None, output_dir='data/output'): + """ + root@d3m-example-pod:/# cat /input/185_baseball/test_config.json + { + "problem_schema": "/input/TEST/problem_TEST/problemDoc.json", + "problem_root": "/input/TEST/problem_TEST", + "dataset_schema": "/input/TEST/dataset_TEST/datasetDoc.json", + "test_data_root": "/input/TEST/dataset_TEST", + "results_root": "/output/predictions", + "executables_root": "/output/executables", + "temp_storage_root": "/output/supporting_files" + } + root@d3m-example-pod:/# cat /input/185_baseball/search_config.json + { + "problem_schema": "/input/TRAIN/problem_TRAIN/problemDoc.json", + "problem_root": "/input/TRAIN/problem_TRAIN", + "dataset_schema": "/input/TRAIN/dataset_TRAIN/datasetDoc.json", + "training_data_root": "/input/TRAIN/dataset_TRAIN", + "pipeline_logs_root": "/output/pipelines", + "executables_root": "/output/executables", + "user_problems_root": "/output/user_problems", + "temp_storage_root": "/output/supporting_files" + } + + """ + + if problem: + full_phase = phase + '_' + problem + else: + full_phase = phase + + root_dir = os.path.join(datasets_dir, dataset, full_phase) + problem_root = os.path.join(root_dir, 'problem_' + phase) + data_root = os.path.join(root_dir, 'dataset_' + phase) + + config = { + 'problem_root': 
problem_root, + 'problem_schema': os.path.join(problem_root, 'problemDoc.json'), + 'dataset_schema': os.path.join(data_root, 'datasetDoc.json'), + 'executables_root': os.path.join(output_dir, 'executables'), + 'temp_storage_root': os.path.join(output_dir, 'supporting_files'), + } + + if phase == 'TRAIN': + config['training_data_root'] = data_root + config['pipeline_logs_root'] = os.path.join(output_dir, 'pipelines') + else: + config['test_data_root'] = data_root + config['results_root'] = os.path.join(output_dir, 'predictions') + + return config + + +PHASES = { + 'TRAIN': 'search', + 'TEST': 'test' +} + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Generate D3M dataset config files') + parser.add_argument('-a', '--absolute', action='store_true') + parser.add_argument('-b', '--base-dir', default='data', nargs='?') + parser.add_argument('-d', '--datasets', default='datasets', nargs='?') + parser.add_argument('-o', '--output', default='output', nargs='?') + parser.add_argument('-c', '--config-dir', required=True) + parser.add_argument('-p', '--problem', default='', nargs='?') + parser.add_argument('dataset', nargs='+') + + args = parser.parse_args() + + if args.absolute: + base_dir = make_abs(args.base_dir, os.getcwd()) + datasets = make_abs(args.datasets, base_dir) + output = make_abs(args.output, base_dir) + else: + base_dir = args.base_dir + datasets = os.path.join(base_dir, args.datasets) + output = os.path.join(base_dir, args.output) + + for dataset in args.dataset: + for phase, phase_filename in PHASES.items(): + config = build_config(dataset, datasets, phase, args.problem, output) + + filename = '{}_{}.json'.format(dataset, phase_filename) + with open(os.path.join(args.config_dir, filename), 'w') as f: + json.dump(config, f, indent=4) diff --git a/mit_d3m/dataset.py b/mit_d3m/dataset.py new file mode 100644 index 0000000..ca6baed --- /dev/null +++ b/mit_d3m/dataset.py @@ -0,0 +1,326 @@ +# -*- coding: utf-8 -*- + +import json +import logging +import os +import re +import warnings +from urllib.parse import urlparse + +import networkx as nx +import pandas as pd + +logger = logging.getLogger(__name__) + +RE_PYTHONIZE = re.compile(r'[A-Z]') + + +def pythonize(name): + pythonized = re.sub('[A-Z]', '_\g<0>', name).lower() + if pythonized.startswith('_'): + pythonized = pythonized[1:] + + return pythonized + + +DATASET_SCHEMA_VERSION = '3.0' +PROBLEM_SCHEMA_VERSION = '3.0' + + +class D3MDataset: + dsHome = None + dsDoc = None + learningDataFile = None + + def _get_learning_data_path(self): + """ + Returns the path of learningData.csv in a dataset + """ + for res in self.dsDoc['dataResources']: + resPath = res['resPath'] + resType = res['resType'] + + dirname = os.path.basename(os.path.normpath(os.path.dirname(resPath))) + + if resType == 'table' and dirname == 'tables': + if 'learningData.csv' in res['resPath']: + return os.path.join(self.dsHome, resPath) + + # if the for loop is over and learningDoc is not found, then return None + raise RuntimeError('could not find learningData file the dataset') + + def __init__(self, dataset): + # handle uris + logger.info("Loading dataset: %s", dataset) + dataset = urlparse(dataset).path + self.dsHome = dataset + + # read the schema in dsHome + if os.path.isdir(dataset): + self.dsHome = dataset + _dsDoc = os.path.join(self.dsHome, 'datasetDoc.json') + else: + self.dsHome = os.path.dirname(dataset) + _dsDoc = dataset + + assert os.path.exists(_dsDoc), _dsDoc + with open(_dsDoc, 'r') as f: + self.dsDoc = json.load(f) + + # 
make sure the versions line up + if self.get_dataset_schema_version() != DATASET_SCHEMA_VERSION: + warnings.warn("the datasetSchemaVersions in the API and datasetDoc do not match!") + + # locate the special learningData file + self.learningDataFile = self._get_learning_data_path() + + def get_datasetID(self): + """Get the datasetID from datasetDoc.""" + return self.dsDoc['about']['datasetID'] + + def get_dataset_schema_version(self): + """Get the dataset schema version that was used to create this dataset.""" + return self.dsDoc['about']['datasetSchemaVersion'] + + def get_learning_data(self): + """Get the contents of learningData.csv as a DataFrame.""" + return pd.read_csv(self.learningDataFile, index_col='d3mIndex') + + def _get_learning_data_resource(self): + """ + Returns the learningData resource entry of the dataset + """ + for res in self.dsDoc['dataResources']: + resPath = res['resPath'] + resType = res['resType'] + if resType == 'table': + if 'learningData.csv' in resPath: + return res + else: + raise RuntimeError('could not find learningData.csv') + + # if the for loop finishes and the learningData resource is not found, raise an error + raise RuntimeError('could not find learningData resource') + + def get_learning_data_columns(self): + res = self._get_learning_data_resource() + return res['columns'] + + def get_resource_types(self): + return [dr["resType"] for dr in self.dsDoc['dataResources']] + + def get_data_modality(self): + """Detect the data modality based on the resource_types. + + resource_types == ['table'] => 'tabular' + resource_types == ['something_else'...] => 'something_else' # this is not likely + resource_types == ['table', 'table'...] => 'tabular' + resource_types == ['table', 'something_else'...] => 'something_else' + """ + resource_types = self.get_resource_types() + first_type = resource_types[0] + if first_type != 'table': + return first_type + + elif len(resource_types) == 1: + return 'tabular' + + else: + second_type = resource_types[1] + if second_type == 'table': + return 'tabular' + + return second_type + + def get_image_path(self): + """ + Returns the path of the directory containing images if they exist in this dataset.
+ """ + for res in self.dsDoc['dataResources']: + resPath = res['resPath'] + resType = res['resType'] + isCollection = res['isCollection'] + + if resType == 'image' and isCollection: + return os.path.join(self.dsHome, resPath) + + # if the for loop is over and no image directory is found, then return None + raise RuntimeError('could not find learningData file the dataset') + + def get_graph_resources(self): + return [r for r in self.dsDoc['dataResources'] if r["resType"] == "graph"] + + def get_graphs_as_nx(self): + graph_res = self.get_graph_resources() + + graphs = {} + # todo allow more than one graph resource + for g in graph_res: + graph_path = os.path.join(self.dsHome, g["resPath"]) + try: + graphs[g['resID']] = nx.read_gml(graph_path) + except nx.exception.NetworkXError: + graphs[g['resID']] = nx.read_gml(graph_path, label='id') + + return graphs + + def _get_resources_by_type(self, resource_type): + """ + Returns the list of resources that are of the indicated type + """ + resources = [] + for res in self.dsDoc['dataResources']: + if res['resType'] == resource_type: + resources.append(res) + + return resources + + def get_related_resource_names(self, resource_type): + related_names = dict() + related_resources = self._get_resources_by_type(resource_type) + related_resources = {r['resID'] for r in related_resources} + + for column in self.get_learning_data_columns(): + refers_to = column.get('refersTo') + if refers_to: + res_id = refers_to['resID'] + if res_id in related_resources: + related_names[column['colName']] = res_id + + return related_names + + def get_text_path(self): + """ + Returns the path of the directory containing text if they exist in this dataset. + """ + for res in self.dsDoc['dataResources']: + resPath = res['resPath'] + resType = res['resType'] + isCollection = res['isCollection'] + if resType == 'text' and isCollection: + return os.path.join(self.dsHome, resPath) + + # if the for loop is over and no image directory is found, then return None + raise RuntimeError('could not find learningData file the dataset') + + +class D3MProblem: + prHome = None + prDoc = None + splitsFile = None + + def __init__(self, problem): + if isinstance(problem, dict): + self.prDoc = problem + else: + self.prHome = problem + + # read the schema in prHome + _prDoc = os.path.join(self.prHome, 'problemDoc.json') + assert os.path.exists(_prDoc), _prDoc + with open(_prDoc, 'r') as f: + self.prDoc = json.load(f) + + # make sure the versions line up + if self.get_problem_schema_version() != PROBLEM_SCHEMA_VERSION: + warnings.warn("the problemSchemaVersions in the API and datasetDoc do not match!") + + def get_task_type(self): + return self.prDoc["about"].get("taskType", "") + + def get_task_subtype(self): + return self.prDoc["about"].get("taskSubType", "") + + def get_problem_id(self): + """Get the problemID from problemDoc.""" + return self.prDoc['about']['problemID'] + + def get_problem_schema_version(self): + """Get the problem schema version that was used to create this dataset.""" + return self.prDoc['about']['problemSchemaVersion'] + + def get_performance_metrics(self): + return self.prDoc['inputs']['performanceMetrics'] + + def get_target_column_names(self): + targets = self.prDoc['inputs']['data'][0]['targets'] + target_columns = [] + for target in targets: + target_columns.append(target['colName']) + + return target_columns + + +class D3MDS: + dataset = None + problem = None + + def __init__(self, dataset, problem): + if isinstance(dataset, D3MDataset): + self.dataset = dataset + 
else: + self.dataset = D3MDataset(dataset) + + if isinstance(problem, D3MProblem): + self.problem = problem + else: + self.problem = D3MProblem(problem) + + self.dataset_doc = self.dataset.dsDoc + self.problem_doc = self.problem.prDoc + self.dataset_root = self.dataset.dsHome + self.dataset_id = self.dataset.get_datasetID() + self.target_column = self.problem.get_target_column_names()[0] + + def get_data_all(self, dropTargets=False): + df = self.dataset.get_learning_data() + if dropTargets: + df.drop(self.target_column, axis=1, inplace=True, errors='ignore') + + return df + + def get_data(self, targets=True, limit=None): + df = self.dataset.get_learning_data().head(limit) + + y = df[self.target_column] + X = df.drop(self.target_column, axis=1, errors='ignore') + + if targets: + return X, y + + else: + return X + + def get_targets(self, limit=None): + df = self.dataset.get_learning_data().head(limit) + return df[self.target_column] + + def get_columns(self): + return self.dataset.get_learning_data_columns() + + def get_resources_dir(self, data_modality): + if data_modality == 'image': + return self.dataset.get_image_path() + if data_modality == 'text': + return self.dataset.get_text_path() + + def get_related_resources(self, data_modality): + return self.dataset.get_related_resource_names(data_modality) + + def load_graphs(self): + return self.dataset.get_graphs_as_nx() + + def get_data_modality(self): + return self.dataset.get_data_modality() + + def get_problem_id(self): + return self.problem.get_problem_id() + + def get_task_type(self): + return pythonize(self.problem.get_task_type()) + + def get_task_subtype(self): + return pythonize(self.problem.get_task_subtype()) + + def get_metric(self): + return self.problem.get_performance_metrics()[0]['metric'] diff --git a/mit_d3m/db.py b/mit_d3m/db.py new file mode 100644 index 0000000..d8e3d77 --- /dev/null +++ b/mit_d3m/db.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +import getpass +import json +import logging + +from pymongo import MongoClient + +LOGGER = logging.getLogger(__name__) + + +def get_db(database=None, config=None, **kwargs): + if config: + with open(config, 'r') as f: + config = json.load(f) + else: + config = kwargs + + host = config.get('host', 'localhost') + port = config.get('port', 27017) + user = config.get('user') + password = config.get('password') + database = database or config.get('database', 'test') + auth_database = config.get('auth_database', 'admin') + + if user and not password: + password = getpass.getpass(prompt='Please enter the database password: ') + + client = MongoClient( + host=host, + port=port, + username=user, + password=password, + authSource=auth_database + ) + + LOGGER.info("Setting up a MongoClient %s", client) + + return client[database] diff --git a/mit_d3m/loaders.py b/mit_d3m/loaders.py new file mode 100644 index 0000000..42ae3cb --- /dev/null +++ b/mit_d3m/loaders.py @@ -0,0 +1,505 @@ +# -*- coding: utf-8 -*- + +import logging +import os +from collections import OrderedDict + +import networkx as nx +import numpy as np +import pandas as pd +from keras.preprocessing.image import img_to_array, load_img + +from mit_d3m.utils import available_memory, used_memory + +LOGGER = logging.getLogger(__name__) + + +class Dataset: + + def __init__(self, name, X=None, y=None, context=None): + self.name = name + self.X = X + self.y = y + self.context = context or dict() + + def __repr__(self): + attributes = ["'{}'".format(self.name)] + for attribute in ['X', 'y', 'context']: + if getattr(self, attribute) is 
not None: + attributes.append(attribute) + + return "Dataset({})".format(', '.join(attributes)) + + def get_split(self, indexes): + X = self.X + if hasattr(X, 'iloc'): + X = X.iloc[indexes] + else: + X = X[indexes] + + y = self.y + if y is not None: + if hasattr(y, 'iloc'): + y = y.iloc[indexes] + else: + y = y[indexes] + + return X, y + + +class Loader(object): + + def __init__(self, data_modality, task_type): + self.data_modality = data_modality + self.task_type = task_type + + def load(self, d3mds): + """Load X, y and context from D3MDS.""" + X, y = d3mds.get_data() + + return Dataset(d3mds.dataset_id, X, y) + + def to_dict(self): + return { + 'data_modality': self.data_modality, + 'task_type': self.task_type, + } + +def features_by_type(column_types, columns): + if not isinstance(column_types, list): + column_types = [column_types] + + features = [] + for column in columns: + is_of_type = column['colType'] in column_types + target = column['role'] == ['suggestedTarget'] + if is_of_type and not target: + features.append(column['colName']) + + return features + + +class TabularLoader(Loader): + + @staticmethod + def find_privileged_features(dataset_doc, tables): + privileged_features = dict() + for quality in dataset_doc.get('qualities', []): + privileged_quality = quality['qualName'] == 'privilegedFeature' + privileged_true = quality['qualValue'] == 'True' + restricted_to = quality.get('restrictedTo') + + if privileged_quality and privileged_true and restricted_to: + + res_id = restricted_to['resID'] + privileged_feature = privileged_features.setdefault(res_id, list()) + + res_component = restricted_to.get('resComponent') + if res_component is not None: + column_name = res_component.get('columnName') + if column_name is None: + column_index = res_component.get('columnIndex') + if column_index is not None: + column_name = tables[res_id]['columns'][column_index]['columnName'] + + if column_name: + privileged_feature.append(column_name) + + return privileged_features + + @classmethod + def remove_privileged_features(cls, dataset_doc, tables): + privileged_features = cls.find_privileged_features(dataset_doc, tables) + for res_id, columns in privileged_features.items(): + if columns and res_id in tables: + tables[res_id]['data'].drop(columns, axis=1, inplace=True) + + @staticmethod + def map_dtype_to_d3m_type(dtype): + if 'int' in str(dtype): + return 'integer' + elif 'float' in str(dtype): + return 'real' + elif 'str' in str(dtype): + return 'string' + elif 'object' in str(dtype): + return 'categorical' + elif 'date' in str(dtype): + return 'dateTime' + elif 'bool' in str(dtype): + return 'boolean' + else: + return 'categorical' + + @classmethod + def load_timeseries(cls, learning_data, timeseries_column, timeseries_res_path): + + dataframes = [] + for d3m_index, row in learning_data.iterrows(): + filename = row[timeseries_column] + df = pd.read_csv(os.path.join(timeseries_res_path, filename)) + df['d3mIndex'] = d3m_index + dataframes.append(df) + + index_column = "timeseries_index" + + full_df = pd.concat(dataframes) + full_df.reset_index(inplace=True, drop=True) + full_df.index.name = index_column + full_df.reset_index(inplace=True, drop=False) + + columns = { + column: { + 'colIndex': index, + 'colName': column, + 'colType': cls.map_dtype_to_d3m_type(full_df[column].dtype) + } + for index, column in enumerate(full_df) + } + + time_index = None + if 'time' in full_df.columns: + time_index = 'time' + + return { + 'columns': columns, + 'data': full_df, + 'index': index_column, + 
'time_index': time_index + } + + @staticmethod + def get_timeseries_column(timeseries_res_id, learning_data_columns): + for name, details in learning_data_columns.items(): + refers_to = details.get('refersTo', dict()).get('resID') + if refers_to == timeseries_res_id: + return name + + @classmethod + def load_tables(cls, d3mds): + """Load tables and timeseries as DataFrames.""" + + tables = dict() + main_table = None + timeseries = None + + for res in d3mds.dataset_doc['dataResources']: + + res_path = res['resPath'] + res_type = res['resType'] + res_id = res['resID'] + + table_name = res_path.split('.')[0] + table_name = table_name.replace("tables/", "") + if table_name.endswith('/'): + table_name = table_name[:-1] + + dirname = os.path.basename(os.path.normpath(os.path.dirname(res_path))) + + if res_type == 'table' and dirname == 'tables': + + columns = dict() + index_column = None + time_index_column = None + target_column = None + + for column in res['columns']: + column_name = column['colName'] + + if 'suggestedTarget' in column['role']: + if target_column: + raise ValueError("Multiple targets found") + + target_column = column_name + + else: + + columns[column_name] = column + if 'index' in column['role']: + if index_column: + raise ValueError("Multiple indexes found") + + index_column = column_name + + if 'timeIndicator' in column['role']: + if time_index_column: + raise ValueError("Multiple indexes found") + + time_index_column = column_name + + df = pd.read_csv(os.path.join(d3mds.dataset_root, res_path)) + + if index_column: + df.set_index(index_column, drop=False, inplace=True) + + if target_column: + df.drop(target_column, axis=1, errors='ignore', inplace=True) + + table = { + 'res_id': res_id, + 'table_name': table_name, + 'columns': columns, + 'data': df, + 'index': index_column, + 'time_index': time_index_column + } + + if 'learningData.csv' in res_path: + main_table = table + table['main'] = True + else: + table['main'] = False + + tables[res_id] = table + + elif res_type == 'timeseries': + timeseries = { + 'res_path': os.path.join(d3mds.dataset_root, res_path), + 'res_id': res_id, + 'table_name': table_name + } + + if main_table is None: + raise RuntimeError('Main table not found') + + if timeseries: + timeseries_res_id = timeseries['res_id'] + + timeseries_column = cls.get_timeseries_column( + timeseries_res_id, + main_table['columns'] + ) + + if timeseries_column: + main_data = main_table['data'] + timeseries_path = timeseries['res_path'] + + table = cls.load_timeseries(main_data, timeseries_column, timeseries_path) + timeseries.update(table) + + del main_data[timeseries_column] + + tables[timeseries_res_id] = timeseries + + cls.remove_privileged_features(d3mds.dataset_doc, tables) + + return tables + + @staticmethod + def get_relationships(tables): + relationships = [] + table_names = { + table['res_id']: table['table_name'] + for table in tables.values() + } + + for table in tables.values(): + columns = table['columns'] + df = table['data'] + table_name = table['table_name'] + + for column_name, column in columns.items(): + refers_to = column.get('refersTo') + + if refers_to: + res_id = refers_to['resID'] + res_obj = refers_to['resObject'] + + foreign_table_name = table_names[res_id] + + if column_name in df.columns and isinstance(res_obj, dict): + + foreign_table_name = table_names[res_id] + + if 'columnIndex' in res_obj: + column_index = res_obj['columnIndex'] + foreign_column_name = table['columns'][column_index]['colName'] + + else: + foreign_column_name = 
res_obj['columnName'] + + relationships.append(( + foreign_table_name, + foreign_column_name, + table_name, + column_name, + )) + + elif table['main'] and res_obj == 'item': + foreign_column_name = 'd3mIndex' + column_name = 'd3mIndex' + + relationships.append(( + table_name, + column_name, + foreign_table_name, + foreign_column_name, + )) + + return relationships + + def load(self, d3mds): + X, y = d3mds.get_data() + + tables = self.load_tables(d3mds) + relationships = self.get_relationships(tables) + + entities = dict() + for table in tables.values(): + entities[table['table_name']] = ( + table['data'], + table['index'], + table['time_index'] + ) + + context = { + 'target_entity': 'learningData', + 'entities': entities, + 'relationships': relationships + } + + return Dataset(d3mds.dataset_id, X, y, context) + + +class ResourceLoader(Loader): + + def load_resources(self, resources_names, d3mds): + raise NotImplementedError + + def get_context(self, X, y): + return None + + def get_resources_column(self, d3mds): + related_names = d3mds.get_related_resources(self.data_modality) + + if len(related_names) != 1: + raise ValueError("Inconsistent number of related resources %s" % related_names) + + return list(related_names.keys())[0] + + def load(self, d3mds): + """Load X, y and context from D3MDS.""" + X, y = d3mds.get_data() + + resources_name_column = self.get_resources_column(d3mds) + X = self.load_resources(X[resources_name_column], d3mds) + + context = self.get_context(X, y) + + return Dataset(d3mds.dataset_id, X, y, context=context) + + +class ImageLoader(ResourceLoader): + + INPUT_SHAPE = [224, 224, 3] + EPOCHS = 1 + + def load_resources(self, X, d3mds): + LOGGER.info("Loading %s images", len(X)) + + image_dir = d3mds.get_resources_dir('image') + images = [] + + for filename in X: + if used_memory() > available_memory(): + raise MemoryError() + + filename = os.path.join(image_dir, filename) + image = load_img(filename) + image = image.resize(tuple(self.INPUT_SHAPE[0:2])) + image = img_to_array(image) + image = image / 255.0 # Quantize images. 
+ images.append(image) + + return pd.Series(np.array(images), index=X.index) + + +class TextLoader(ResourceLoader): + + EPOCHS = 5 + + def load_resources(self, X, d3mds): + texts_dir = d3mds.get_resources_dir('text') + texts = [] + for filename in X: + with open(os.path.join(texts_dir, filename), 'r') as text_file: + texts.append(text_file.read()) + + texts = pd.Series(texts, name='texts', index=X.index) + + return pd.DataFrame(texts) + + +class GraphLoader(Loader): + + def load_graphs(self, d3mds, max_graphs=2): + graphs = d3mds.load_graphs() + node_columns = d3mds.get_related_resources(self.data_modality) + + graph_names = OrderedDict() + for _, (column, graph_id) in zip(range(max_graphs), node_columns.items()): + graph_names[column] = nx.Graph(graphs[graph_id]) + + return graph_names + + def get_context(self, X, d3mds): + if self.task_type == 'community_detection': + graphs = self.load_graphs(d3mds, 1) + column, graph = list(graphs.items())[0] + context = { + 'graph': graph, + } + + elif self.task_type == 'link_prediction': + graphs = self.load_graphs(d3mds, 2) + columns = list(graphs.keys()) + context = { + 'node_columns': columns, + 'graph': graphs[columns[-1]] + } + + elif self.task_type == 'vertex_nomination': + graphs = self.load_graphs(d3mds, 1) + context = { + 'graphs': graphs + } + + elif self.task_type == 'graph_matching': + graphs = self.load_graphs(d3mds, 2) + columns = list(graphs.keys()) + graph_0, graph_1 = tuple(graphs.values()) + + pairs = X[columns].values + graph = graph_0.copy() + graph.add_nodes_from(graph_1.nodes(data=True)) + graph.add_edges_from(graph_1.edges) + graph.add_edges_from(pairs) + + context = { + 'node_columns': columns, + 'graph': graph, + 'graphs': graphs + } + + return context + + def load(self, d3mds): + X, y = d3mds.get_data() + + context = self.get_context(X, d3mds) + + return Dataset(d3mds.dataset_id, X, y, context=context) + + +_LOADERS = { + 'tabular': TabularLoader, + 'timeseries': TabularLoader, + 'image': ImageLoader, + 'text': TextLoader, + 'graph': GraphLoader, +} + + +def get_loader(data_modality, task_type): + loader_class = _LOADERS.get(data_modality, Loader) + return loader_class(data_modality, task_type) diff --git a/mit_d3m/metrics.py b/mit_d3m/metrics.py new file mode 100644 index 0000000..3025680 --- /dev/null +++ b/mit_d3m/metrics.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- + +import warnings + +from sklearn import metrics +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.preprocessing import LabelBinarizer + +warnings.filterwarnings("ignore", category=UndefinedMetricWarning) + + +def accuracy(ground_truth, predicted): + return metrics.accuracy_score(ground_truth, predicted) + + +def f1(ground_truth, predicted): + return metrics.f1_score(ground_truth, predicted) + + +def f1_micro(ground_truth, predicted): + return metrics.f1_score(ground_truth, predicted, average='micro') + + +def f1_macro(ground_truth, predicted): + return metrics.f1_score(ground_truth, predicted, average='macro') + + +def roc_auc(ground_truth, predicted): + return metrics.roc_auc_score(ground_truth, predicted) + + +def roc_auc_micro(ground_truth, predicted): + ground_truth, predicted = _binarize(ground_truth, predicted) + return metrics.roc_auc_score(ground_truth, predicted, average='micro') + + +def roc_auc_macro(ground_truth, predicted): + ground_truth, predicted = _binarize(ground_truth, predicted) + return metrics.roc_auc_score(ground_truth, predicted, average='macro') + + +def l2(ground_truth, predicted): + return 
(metrics.mean_squared_error(ground_truth, predicted))**0.5 + + +def avg_l2(ground_truth_l, predicted_l): + l2_sum = 0.0 + count = 0 + for pair in zip(ground_truth_l, predicted_l): + l2_sum += l2(pair[0], pair[1]) + count += 1 + return l2_sum / count + + +def l1(ground_truth, predicted): + return metrics.mean_absolute_error(ground_truth, predicted) + + +def r2(ground_truth, predicted): + return metrics.r2_score(ground_truth, predicted) + + +def norm_mut_info(ground_truth, predicted): + return metrics.normalized_mutual_info_score(ground_truth, predicted) + + +def jacc_sim(ground_truth, predicted): + return metrics.jaccard_similarity_score(ground_truth, predicted) + + +def mean_se(ground_truth, predicted): + return metrics.mean_squared_error(ground_truth, predicted) + + +def _binarize(ground, pred): + label_binarizer = LabelBinarizer() + return label_binarizer.fit_transform(ground), label_binarizer.transform(pred) + + +# MIT LL defined these strings here: +# https://gitlab.datadrivendiscovery.org/MIT-LL/d3m_data_supply/blob/shared/documentation/problemSchema.md#performance-metrics +METRICS_DICT = { + 'accuracy': accuracy, + 'f1': f1, + 'f1Micro': f1_micro, + 'f1Macro': f1_macro, + 'rocAuc': roc_auc, + 'rocAucMicro': roc_auc_micro, + 'rocAucMacro': roc_auc_macro, + 'meanSquaredError': mean_se, + 'rootMeanSquaredError': l2, + 'rootMeanSquaredErrorAvg': avg_l2, + 'meanAbsoluteError': l1, + 'rSquared': r2, + 'normalizedMutualInformation': norm_mut_info, + 'jaccardSimilarityScore': jacc_sim +} diff --git a/mit_d3m/stats.py b/mit_d3m/stats.py new file mode 100644 index 0000000..2e11914 --- /dev/null +++ b/mit_d3m/stats.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- + +import argparse +import logging +import os + +import pandas as pd + +from mit_d3m.config import build_config +from mit_d3m.dataset import D3MDS +from mit_d3m.utils import disk_usage, logging_setup, make_abs + +LOGGER = logging.getLogger(__name__) + + +def get_d3mds(dataset, path, phase, problem): + config = build_config(dataset, path, phase, problem) + dataset_key = 'training' if phase == 'TRAIN' else 'test' + d3mds = D3MDS( + datasetPath=config[dataset_key + '_data_root'], + problemPath=config['problem_root'] + ) + return d3mds + + +def get_dataset_stats(dataset, path, problem): + train_d3mds = get_d3mds(dataset, path, 'TRAIN', problem) + test_d3mds = get_d3mds(dataset, path, 'TEST', problem) + + train_shape = train_d3mds.get_data(False).shape + test_shape = test_d3mds.get_data(False).shape + + size = disk_usage(os.path.join(path, dataset, dataset + '_dataset')) + size_human = disk_usage(os.path.join(path, dataset, dataset + '_dataset'), True) + + if problem: + dataset = dataset + '_' + problem + + return { + 'dataset': dataset, + 'dataset_id': train_d3mds.dataset_id, + 'problem_id': train_d3mds.problem_id, + 'data_modality': train_d3mds.get_data_modality(), + 'task_type': train_d3mds.get_task_type(), + 'task_subtype': train_d3mds.get_task_subtype(), + 'metric': train_d3mds.get_metric(), + 'target': train_d3mds.target_column, + 'train_samples': train_shape[0], + 'train_features': train_shape[1], + 'test_samples': test_shape[0], + 'test_features': test_shape[1], + 'size': size, + 'size_human': size_human + } + + +def get_problems(dataset, path): + dataset_path = os.path.join(path, dataset) + folders = os.listdir(dataset_path) + problems = [] + for folder in folders: + if folder == 'TRAIN': + problems.append(None) + + if folder.startswith('TRAIN_'): + problems.append(folder.replace('TRAIN_', '')) + + return problems + + +def 
get_stats(datasets, path):
+    data = []
+    for dataset in datasets:
+        for problem in get_problems(dataset, path):
+            try:
+                stats = get_dataset_stats(dataset, path, problem)
+            except Exception as e:
+                LOGGER.exception("Exception in dataset %s", dataset)
+                stats = {
+                    'dataset': dataset,
+                    'error': str(e),
+                }
+
+            data.append(stats)
+
+    return pd.DataFrame(data)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='Get Dataset summary')
+    parser.add_argument('-i', '--input', default='data/datasets', nargs='?')
+    parser.add_argument('-o', '--output', nargs='?')
+    parser.add_argument('datasets', nargs='*')
+
+    args = parser.parse_args()
+
+    logging_setup()
+
+    args.input = make_abs(args.input, os.getcwd())
+
+    if not args.datasets:
+        args.datasets = os.listdir(args.input)
+
+    print("Processing Datasets: {}".format(args.datasets))
+
+    output = get_stats(args.datasets, args.input)
+
+    if args.output:
+        print("Storing report as {}".format(args.output))
+        output.to_csv(args.output, index=False)
+
+    else:
+        print(output)
diff --git a/mit_d3m/utils.py b/mit_d3m/utils.py
new file mode 100644
index 0000000..286ce95
--- /dev/null
+++ b/mit_d3m/utils.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+
+import logging
+import os
+import subprocess
+
+import psutil
+
+
+def make_abs(path, base_dir=None):
+    base_dir = base_dir or os.getcwd()
+    if not os.path.isabs(path):
+        return os.path.join(base_dir, path)
+
+    return path
+
+
+def used_memory():
+    return psutil.Process(os.getpid()).memory_info().rss
+
+
+def available_memory():
+    return psutil.virtual_memory().available
+
+
+def disk_usage(path, human=False):
+    """disk usage in bytes or human readable format (e.g. '2,1GB')"""
+    command = ['du', '-s', path]
+    if human:
+        command.append('-h')
+
+    return subprocess.check_output(command).split()[0].decode('utf-8')
+
+
+def walk(document, transform):
+    if not isinstance(document, dict):
+        return document
+
+    new_doc = dict()
+    for key, value in document.items():
+        if isinstance(value, dict):
+            value = walk(value, transform)
+        elif isinstance(value, list):
+            value = [walk(v, transform) for v in value]
+
+        new_key, new_value = transform(key, value)
+        new_doc[new_key] = new_value
+
+    return new_doc
+
+
+def remove_dots(document):
+    return walk(document, lambda key, value: (key.replace('.', '-'), value))
+
+
+def restore_dots(document):
+    return walk(document, lambda key, value: (key.replace('-', '.'), value))
+
+
+def logging_setup(verbosity=1, logfile=None, logger_name=None):
+    logger = logging.getLogger(logger_name)
+    log_level = (3 - verbosity) * 10
+    fmt = '%(asctime)s - %(process)d - %(levelname)s - %(module)s - %(message)s'
+    formatter = logging.Formatter(fmt)
+    logger.setLevel(log_level)
+    logger.propagate = False
+
+    if logfile:
+        file_handler = logging.FileHandler(logfile)
+        file_handler.setLevel(logging.DEBUG)
+        file_handler.setFormatter(formatter)
+        logger.addHandler(file_handler)
+
+    else:
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(log_level)
+        console_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..2b9be21
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,44 @@
+[bumpversion]
+current_version = 0.1.0-dev
+commit = True
+tag = True
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
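+# 'parse' splits current_version into named parts and 'serialize' (below) rebuilds it,
+# e.g. 0.1.0-dev -> major=0, minor=1, patch=0, release=dev.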
+serialize = + {major}.{minor}.{patch}-{release} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = release +values = + dev + release + +[bumpversion:file:setup.py] +search = version='{current_version}' +replace = version='{new_version}' + +[bumpversion:file:mit_d3m/__init__.py] +search = __version__ = '{current_version}' +replace = __version__ = '{new_version}' + +[bdist_wheel] +universal = 1 + +[flake8] +max-line-length = 99 +exclude = .tox, .git, __pycache__, .ipynb_checkpoints +ignore = # Keep empty to prevent default ignores + +[isort] +include_trailing_comment = True +line_length = 99 +lines_between_types = 0 +multi_line_output = 4 +not_skip = __init__.py +use_parentheses = True + +[aliases] +test = pytest + +[tool:pytest] +collect_ignore = ['setup.py'] diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..ad6135c --- /dev/null +++ b/setup.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from setuptools import setup, find_packages + +with open('README.md') as readme_file: + readme = readme_file.read() + +with open('HISTORY.md') as history_file: + history = history_file.read() + +install_requires = [ + 'baytune==0.2.1', + 'boto==2.48.0', + 'featuretools==0.2.0', + 'mlblocks==0.2.3', + 'mlprimitives==0.1.2', + 'networkx==2.1', + 'numpy==1.15.2', + 'pandas==0.23.4', + 'pymongo==3.7.2', + 'requests==2.18.4', + 'scikit-learn==0.20.0', + 'scipy==1.1.0', +] + +setup_requires = [ + 'pytest-runner>=2.11.1', +] + +tests_require = [ + 'coverage>=4.5.1', + 'pytest>=3.4.2', + 'tox>=2.9.1', +] + +development_requires = [ + # general + 'bumpversion>=0.5.3', + 'pip>=9.0.1', + 'watchdog>=0.8.3', + + # docs + 'm2r>=0.2.0', + 'Sphinx>=1.7.1', + 'sphinx_rtd_theme>=0.2.4', + + # style check + 'flake8>=3.5.0', + 'isort>=4.3.4', + + # fix style issues + 'autoflake>=1.1', + 'autopep8>=1.3.5', + + # distribute on PyPI + 'twine>=1.10.0', + 'wheel>=0.30.0', +] + + +setup( + author="MIT Data To AI Lab", + author_email='dailabmit@gmail.com', + classifiers=[ + 'Development Status :: 2 - Pre-Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Natural Language :: English', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + ], + description="MIT tools to work with D3M datasets.", + extras_require={ + 'test': tests_require, + 'dev': development_requires + tests_require, + }, + include_package_data=True, + install_requires=install_requires, + keywords='mit-d3m', + license="MIT license", + long_description=readme + '\n\n' + history, + long_description_content_type='text/markdown', + name='mit-d3m', + packages=find_packages(include=['mit_d3m', 'mit_d3m.*']), + python_requires='>=3.4', + setup_requires=setup_requires, + test_suite='tests', + tests_require=tests_require, + url='https://github.com/HDI-Project/mit-d3m', + version='0.1.0-dev', + zip_safe=False, +) diff --git a/tests/test_mit_d3m.py b/tests/test_mit_d3m.py new file mode 100644 index 0000000..36a9565 --- /dev/null +++ b/tests/test_mit_d3m.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Tests for `mit_d3m` package.""" + +import unittest + +# from mit_d3m import mit_d3m + + +class TestMit_d3m(unittest.TestCase): + """Tests for `mit_d3m` package.""" + + def setUp(self): + """Set up test fixtures, if any.""" + pass + + def tearDown(self): + """Tear down test fixtures, if any.""" 
+        pass
+
+    def test_000_something(self):
+        """Test something."""
+        self.assertTrue(True)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..cc444fc
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,29 @@
+[tox]
+envlist = py35, py36, lint, docs
+
+
+[travis]
+python =
+    3.6: py36, lint, docs
+    3.5: py35
+
+
+[testenv]
+setenv =
+    PYTHONPATH = {toxinidir}
+commands =
+    /usr/bin/env python setup.py test
+
+
+[testenv:lint]
+skipsdist = true
+extras = dev
+commands =
+    /usr/bin/env make lint
+
+
+[testenv:docs]
+skipsdist = true
+extras = dev
+commands =
+    /usr/bin/env make docs
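+# The [travis] section above is read by the tox-travis plugin (installed from .travis.yml)
+# to pick which environments run on each Travis Python version; the lint and docs
+# environments call the project Makefile, so they expect `make` to be available.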