-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmakedocs.py
383 lines (322 loc) · 13.2 KB
/
makedocs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
import pandas as pd
import os, shutil
from jinja2 import Environment, PackageLoader
import yaml
import os.path
import json
import markdown
class FieldError(Exception):
    """
    Raised to inform the user of a field error in his or
    her yaml configuration.

    Args:
        dataset String name of the dataset whose configuration is faulty
        field   String name of the offending field
    """
    def __init__(self, dataset, field):
        self.dataset = dataset
        self.field = field
        # Compose a human-readable message naming the field and dataset.
        self.value = "%s field in the %s dataset (is the %s field missing?)" % (self.field, self.dataset, self.field)
        # Bug fix: pass the message to Exception so e.args and default
        # formatting carry the full description instead of being empty.
        super().__init__(self.value)

    def __str__(self):
        # Bug fix: previously returned repr(self.field), which discarded
        # the composed message stored in self.value.
        return self.value
class Dataset():
    """
    A dataset is a csv file that is either in the root /data directory or housed in
    a DataFolder anywhere in /data.

    Args:
        name        String full name of this dataset (such as "test.csv")
        title       String human readable title
        description String description
    """
    def __init__(self, name, title=None, description=None):
        self.name = name
        self.title = title
        self.description = description
        # Category objects documenting this dataset's fields.
        self.categories = []

    def addCategory(self, category):
        """
        Add a category object to the category list.

        Args:
            category Category object
        """
        self.categories.append(category)

    def getFieldNames(self):
        """
        Returns a list of field names in this dataset.
        """
        fieldNames = []
        for category in self.categories:
            for field in category.fields:
                fieldNames.append(field.name)
        return fieldNames

    def getHtmlName(self, appendText=None):
        """
        Returns a string taking the file name and turning it into a
        reasonable html file name that strips white space and .csv

        Args:
            appendText String optional text to append to the end of the
                       html file name.
        """
        htmlName = self.name.replace(' ', '_')
        htmlName = htmlName.replace('.csv', '')
        if appendText:
            htmlName += "_" + appendText
        htmlName += '.html'
        return htmlName

    def addUncategorizedFields(self, df):
        """
        Adds all fields that have not been documented by the user
        to a category called "Uncategorized" and add the
        category to this dataset.

        Args:
            df Pandas dataframe.
        """
        # create an uncategorized category
        uncategorized = Category("Uncategorized", "Autogenerated list of fields that have not been documented.")
        documentedFieldNames = self.getFieldNames()
        for fieldName in list(df.columns.values):
            if fieldName not in documentedFieldNames:
                # this field name is not documented, so add it to the
                # list of uncategorized fields
                field = Field(fieldName)
                field.dataType = field.getDataType(df)
                # add the field to the category
                uncategorized.addField(field)
        # add the category
        self.addCategory(uncategorized)

    def printSelectAll(self, language="R"):
        """
        Returns code in the specified language that selects all fields
        in this dataset.

        Args:
            language String indicating the language ("R" or "python") to
                     select all variables in. Returns "" for any other value.
        """
        language = language.lower()
        if language not in ("r", "python"):
            return ""
        openToken, closeToken = ("c(", ")") if language == "r" else ("[", "]")
        lines = [openToken]
        for category in self.categories:
            lines.append(" # %s" % category.title)
            for field in category.fields:
                lines.append(' "%s",' % field.name)
        # Bug fix (old TODO "fix last comma issue"): strip the comma from
        # the final field entry so the generated snippet is syntactically
        # valid R — c("a", "b",) is an error in R.
        for i in range(len(lines) - 1, -1, -1):
            if lines[i].endswith(","):
                lines[i] = lines[i][:-1]
                break
        lines.append(closeToken)
        return "\n".join(lines)

    def countFields(self):
        """
        Counts the number of fields in this dataset.
        """
        fieldCount = 0
        for category in self.categories:
            fieldCount += len(category.fields)
        return fieldCount
class Category():
    """
    A category groups any number of fields belonging to a dataset.
    """
    def __init__(self, title, description=None):
        """
        Args:
            title       String title
            description String optional description
        """
        self.title, self.description = title, description
        self.fields = list()

    def addField(self, field):
        """Append a Field object to this category's list of fields."""
        self.fields.append(field)
class Field():
    """
    A field in a dataset.
    """
    def __init__(self, name, description=None, private=False, transformed=False, percentNotNA=None):
        """
        Args:
            name         String field name in the dataset
            description  String field description
            private      Boolean indicating if a field is private or public
            transformed  Boolean indicating if a field is raw (False) or has gone
                         through some transformation process (True).
            percentNotNA Numeric field indicating percentage of observations not NA
        """
        self.name = name
        self.description = description
        self.percentNotNA = percentNotNA
        # dataType is filled in later, either from the yaml config or by
        # getDataType().
        self.dataType = None
        self.private = private
        self.transformed = transformed

    def getDataType(self, df):
        """
        Guesses the datatype of the field of a dataframe.

        Args:
            df Pandas dataframe this field is in
        Return:
            Returns a string guessing the field's data type.
        """
        # the datatype map maps pandas data types to user friendly types
        dataTypeMap = {
            "object": "Text",
            "int64": "Numeric",
            "float64": "Numeric",
            "bool": "Boolean",
            "date": "Date",
            "categorical": "Categorical"
        }
        # set the data type
        dataType = dataTypeMap[str(df[self.name].dtype)]
        # look for special cases where we guess a different datatype.
        # Bug fix: this previously tested the global variable `fieldName`
        # (leaked from the __main__ loop) instead of this field's own name,
        # so date detection matched against the wrong column (and raised
        # NameError when called outside the script).
        if "date" in self.name.lower():
            dataType = dataTypeMap["date"]
        # check if a text datatype is actually categorical
        elif dataType == "Text":
            # if there are fewer than k unique answers, then guess it's categorical
            numberOfUniqueAnswers = len(df[self.name].value_counts())
            if numberOfUniqueAnswers < 20:  # TODO: This is kind of a hack, might think of a better solution
                dataType = dataTypeMap["categorical"]
        return dataType
def generateSearch(datasets):
    """
    Generates a JSON search index that allows users to search fields
    across datasets.

    Args:
        datasets A list of dataset objects.
    Return:
        Returns a JSON string encoding a list of dictionaries, where
        each dictionary describes one field and the links to its
        dataset, category, and field anchors.
    """
    entries = []
    for dataset in datasets:
        # The html file name is the same for every link in this dataset.
        htmlName = dataset.getHtmlName()
        for categoryIndex, category in enumerate(dataset.categories, start=1):
            for fieldIndex, field in enumerate(category.fields, start=1):
                entries.append({
                    "field": field.name,
                    "description": field.description,
                    "category": category.title,
                    "dataset": dataset.title,
                    "field_link": "%s#field-%d-%d" % (htmlName, categoryIndex, fieldIndex),
                    "category_link": "%s#category-%d" % (htmlName, categoryIndex),
                    "dataset_link": "%s" % (htmlName)
                })
    # return as json
    return json.dumps(entries)
if __name__ == "__main__":
    """
    Loop through every dataset in the datadocs yaml
    file.
    """
    # Build the static documentation site: read docs/datadocs.yaml plus one
    # yaml + csv per dataset, then render jinja2 templates into /site.
    # remove the /docs dir if it exists
    if os.path.exists("site"):
        shutil.rmtree('site')
    # if docs doesn't exist, which it shouldn't, make it again
    if not os.path.exists('site'):
        os.makedirs('site')
    # get the data docs settings
    # NOTE(review): yaml.Loader can construct arbitrary Python objects;
    # acceptable for a trusted local config, but yaml.SafeLoader would be safer.
    datadocs = yaml.load(open("docs/datadocs.yaml", "r"), Loader=yaml.Loader)
    showUncategorized = datadocs['show_uncategorized']
    showPercentAnswered = datadocs['show_percent_answered']
    showPrivate = datadocs['show_private']
    # instantiate a list of datasets
    datasets = []
    for selectedDataset in datadocs['datasets']:
        # get the dataset name from the datadocs file
        datasetName = selectedDataset['name']
        # open the dataset yaml (rebinds selectedDataset to the per-dataset config)
        selectedDataset = yaml.load(open("docs/" + datasetName + ".yaml", "r"), Loader=yaml.Loader)
        datasetTitle = selectedDataset['title']
        datasetDescription = selectedDataset['description']
        # set the csv file name, which is just the
        # dataset name plus .csv
        datasetFileName = datasetName
        if ".csv" not in datasetFileName:
            datasetFileName += ".csv"
        # create a dataset object
        dataset = Dataset(datasetName, datasetTitle, datasetDescription)
        # read the data set as a csv and convert to a data frame
        df = pd.read_csv("docs/" + datasetFileName, sep=',', header=0, encoding='ISO-8859-1', index_col=None)
        for selectedCategory in selectedDataset['categories']:
            categoryTitle = selectedCategory['title']
            categoryDescription = None
            if 'description' in selectedCategory:
                categoryDescription = selectedCategory['description']
            # create a category object
            category = Category(categoryTitle, categoryDescription)
            if 'fields' in selectedCategory:
                for selectedField in selectedCategory['fields']:
                    fieldName = selectedField['name']
                    fieldDescription = selectedField['description']
                    fieldIsPrivate = False
                    if 'private' in selectedField:
                        fieldIsPrivate = selectedField['private']
                    fieldIsTransformed = False
                    if 'transformed' in selectedField:
                        fieldIsTransformed = selectedField['transformed']
                    # create a field object only if the field is not private or the
                    # field is private and the settings indiate we want to display
                    # private fields.
                    if showPrivate == True or (showPrivate == False and fieldIsPrivate == False):
                        field = Field(fieldName, description=fieldDescription, private=fieldIsPrivate, transformed=fieldIsTransformed)
                        if "type" not in selectedField:
                            # the user has not documented a datatype, so let's
                            # guess what the data type is.
                            field.dataType = field.getDataType(df)
                        else:
                            # Documentation includes a data type, so use that instead
                            field.dataType = selectedField['type']
                        # add this field to the category
                        category.addField(field)
            # add this category to the dataset
            dataset.addCategory(category)
        # add this dataset to the list of datasets
        datasets.append(dataset)
        # determine if we want to add uncategorized field
        if showUncategorized:
            dataset.addUncategorizedFields(df)
    # generate search index
    search = generateSearch(datasets)
    """
    Render templates
    """
    # jinja2 templating settings
    env = Environment(loader=PackageLoader('makedocs', 'templates'))
    # make index page
    template = env.get_template('home.html')
    file = open('site/index.html', 'w')
    # documentation properties
    docTitle = None
    docDescription = None
    # NOTE(review): this discards the value read from the yaml earlier; the
    # conditional below only restores it when the setting is truthy.
    showPercentAnswered = None
    if datadocs['title']:
        docTitle = datadocs['title']
    if datadocs['show_percent_answered']:
        showPercentAnswered = datadocs['show_percent_answered']
    # check if there is an index.md file in /docs. If there is
    # open it up, convert the markdown contents and pass it along as
    # content
    try:
        content = markdown.markdown(open('docs/index.md', 'r').read())
    except:
        # best-effort: a missing/unreadable index.md just means no extra content
        content = None
    file.write(template.render(datasets=datasets, static="static", home="index.html", docTitle=docTitle,
                               search=search, content=content))
    for dataset in datasets:
        template = env.get_template('dataset.html')
        file = open('site/%s' % (dataset.getHtmlName()), 'w')
        # check if there is an [file_name].md file in /docs. If there is
        # open it up, convert the markdown contents and pass it along as
        # content
        try:
            content = markdown.markdown(open('docs/' + dataset.name + '.md', 'r').read())
        except:
            # best-effort: missing per-dataset markdown is not an error
            content = None
        file.write(template.render(dataset=dataset, datasets=datasets, static="static", home="index.html",
                                   docTitle=docTitle, showPercentAnswered=showPercentAnswered,
                                   showUncategorized=showUncategorized, search=search, content=content))
    # copy static folder (css and images)
    shutil.copytree("static", "site/static")