-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmakedocs.py
383 lines (322 loc) · 13.2 KB
/
makedocs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
import pandas as pd
import os, shutil
from jinja2 import Environment, PackageLoader
import yaml
import os.path
import json
import markdown
class FieldError(Exception):
    """
    Raised to inform the user of a field error in his or
    her yaml configuration.

    Args:
        dataset String name of the dataset whose configuration is faulty
        field   String name of the offending field
    """
    def __init__(self, dataset, field):
        self.dataset = dataset
        self.field = field
        # Compose a human-readable message naming the field and dataset.
        self.value = "%s field in the %s dataset (is the %s field missing?)" % (self.field, self.dataset, self.field)
        # Bug fix: pass the message to Exception so e.args and default
        # formatting carry the full description instead of being empty.
        super().__init__(self.value)

    def __str__(self):
        # Bug fix: previously returned repr(self.field), which discarded
        # the composed message stored in self.value.
        return self.value
class Dataset():
    """
    A dataset is a csv file that is either in the root /data directory or housed in
    a DataFolder anywhere in /data.

    Args:
        name        String full name of this dataset (such as "test.csv")
        title       String human readable title
        description String description
    """
    def __init__(self, name, title=None, description=None):
        self.name = name
        self.title = title
        self.description = description
        # Category objects documenting this dataset's fields.
        self.categories = []

    def addCategory(self, category):
        """
        Add a category object to the category list.

        Args:
            category Category object
        """
        self.categories.append(category)

    def getFieldNames(self):
        """
        Returns a list of field names in this dataset.
        """
        fieldNames = []
        for category in self.categories:
            for field in category.fields:
                fieldNames.append(field.name)
        return fieldNames

    def getHtmlName(self, appendText=None):
        """
        Returns a string taking the file name and turning it into a
        reasonable html file name that strips white space and .csv

        Args:
            appendText String optional text to append to the end of the
                       html file name.
        """
        htmlName = self.name.replace(' ', '_')
        htmlName = htmlName.replace('.csv', '')
        if appendText:
            htmlName += "_" + appendText
        htmlName += '.html'
        return htmlName

    def addUncategorizedFields(self, df):
        """
        Adds all fields that have not been documented by the user
        to a category called "Uncategorized" and add the
        category to this dataset.

        Args:
            df Pandas dataframe.
        """
        # create an uncategorized category
        uncategorized = Category("Uncategorized", "Autogenerated list of fields that have not been documented.")
        documentedFieldNames = self.getFieldNames()
        for fieldName in list(df.columns.values):
            if fieldName not in documentedFieldNames:
                # this field name is not documented, so add it to the
                # list of uncategorized fields
                field = Field(fieldName)
                field.dataType = field.getDataType(df)
                # add the field to the category
                uncategorized.addField(field)
        # add the category
        self.addCategory(uncategorized)

    def printSelectAll(self, language="R"):
        """
        Returns code in the specified language that selects all fields
        in this dataset.

        Args:
            language String indicating the language ("R" or "python") to
                     select all variables in. Returns "" for any other value.
        """
        language = language.lower()
        if language not in ("r", "python"):
            return ""
        openToken, closeToken = ("c(", ")") if language == "r" else ("[", "]")
        lines = [openToken]
        for category in self.categories:
            lines.append(" # %s" % category.title)
            for field in category.fields:
                lines.append(' "%s",' % field.name)
        # Bug fix (old TODO "fix last comma issue"): strip the comma from
        # the final field entry so the generated snippet is syntactically
        # valid R — c("a", "b",) is an error in R.
        for i in range(len(lines) - 1, -1, -1):
            if lines[i].endswith(","):
                lines[i] = lines[i][:-1]
                break
        lines.append(closeToken)
        return "\n".join(lines)

    def countFields(self):
        """
        Counts the number of fields in this dataset.
        """
        fieldCount = 0
        for category in self.categories:
            fieldCount += len(category.fields)
        return fieldCount
class Category():
    """
    A category groups any number of fields belonging to a dataset.
    """
    def __init__(self, title, description=None):
        """
        Args:
            title       String title
            description String optional description
        """
        self.title, self.description = title, description
        self.fields = list()

    def addField(self, field):
        """Append a Field object to this category's list of fields."""
        self.fields.append(field)
class Field():
    """
    A field in a dataset.
    """
    def __init__(self, name, description=None, private=False, transformed=False, percentNotNA=None):
        """
        Args:
            name         String field name in the dataset
            description  String field description
            private      Boolean indicating if a field is private or public
            transformed  Boolean indicating if a field is raw (False) or has gone
                         through some transformation process (True).
            percentNotNA Numeric field indicating percentage of observations not NA
        """
        self.name = name
        self.description = description
        self.percentNotNA = percentNotNA
        # dataType is filled in later, either from the yaml config or by
        # getDataType().
        self.dataType = None
        self.private = private
        self.transformed = transformed

    def getDataType(self, df):
        """
        Guesses the datatype of the field of a dataframe.

        Args:
            df Pandas dataframe this field is in
        Return:
            Returns a string guessing the field's data type.
        """
        # the datatype map maps pandas data types to user friendly types
        dataTypeMap = {
            "object": "Text",
            "int64": "Numeric",
            "float64": "Numeric",
            "bool": "Boolean",
            "date": "Date",
            "categorical": "Categorical"
        }
        # set the data type
        dataType = dataTypeMap[str(df[self.name].dtype)]
        # look for special cases where we guess a different datatype.
        # Bug fix: this previously tested the global variable `fieldName`
        # (leaked from the __main__ loop) instead of this field's own name,
        # so date detection matched against the wrong column (and raised
        # NameError when called outside the script).
        if "date" in self.name.lower():
            dataType = dataTypeMap["date"]
        # check if a text datatype is actually categorical
        elif dataType == "Text":
            # if there are fewer than k unique answers, then guess it's categorical
            numberOfUniqueAnswers = len(df[self.name].value_counts())
            if numberOfUniqueAnswers < 20:  # TODO: This is kind of a hack, might think of a better solution
                dataType = dataTypeMap["categorical"]
        return dataType
def generateSearch(datasets):
    """
    Generates a JSON search index that allows users to search fields
    across datasets.

    Args:
        datasets A list of dataset objects.
    Return:
        Returns a JSON string encoding a list of dictionaries, where
        each dictionary describes one field and the links to its
        dataset, category, and field anchors.
    """
    entries = []
    for dataset in datasets:
        # The html file name is the same for every link in this dataset.
        htmlName = dataset.getHtmlName()
        for categoryIndex, category in enumerate(dataset.categories, start=1):
            for fieldIndex, field in enumerate(category.fields, start=1):
                entries.append({
                    "field": field.name,
                    "description": field.description,
                    "category": category.title,
                    "dataset": dataset.title,
                    "field_link": "%s#field-%d-%d" % (htmlName, categoryIndex, fieldIndex),
                    "category_link": "%s#category-%d" % (htmlName, categoryIndex),
                    "dataset_link": "%s" % (htmlName)
                })
    # return as json
    return json.dumps(entries)
if __name__ == "__main__":
    """
    Loop through every dataset in the datadocs yaml
    file.
    """
    # Build the static documentation site: read docs/datadocs.yaml plus one
    # yaml + csv per dataset, then render jinja2 templates into /site.
    # remove the /docs dir if it exists
    if os.path.exists("site"):
        shutil.rmtree('site')
    # if docs doesn't exist, which it shouldn't, make it again
    if not os.path.exists('site'):
        os.makedirs('site')
    # get the data docs settings
    # NOTE(review): yaml.Loader can construct arbitrary Python objects;
    # acceptable for a trusted local config, but yaml.SafeLoader would be safer.
    datadocs = yaml.load(open("docs/datadocs.yaml", "r"), Loader=yaml.Loader)
    showUncategorized = datadocs['show_uncategorized']
    showPercentAnswered = datadocs['show_percent_answered']
    showPrivate = datadocs['show_private']
    # instantiate a list of datasets
    datasets = []
    for selectedDataset in datadocs['datasets']:
        # get the dataset name from the datadocs file
        datasetName = selectedDataset['name']
        # open the dataset yaml (rebinds selectedDataset to the per-dataset config)
        selectedDataset = yaml.load(open("docs/" + datasetName + ".yaml", "r"), Loader=yaml.Loader)
        datasetTitle = selectedDataset['title']
        datasetDescription = selectedDataset['description']
        # set the csv file name, which is just the
        # dataset name plus .csv
        datasetFileName = datasetName
        if ".csv" not in datasetFileName:
            datasetFileName += ".csv"
        # create a dataset object
        dataset = Dataset(datasetName, datasetTitle, datasetDescription)
        # read the data set as a csv and convert to a data frame
        df = pd.read_csv("docs/" + datasetFileName, sep=',', header=0, encoding='ISO-8859-1', index_col=None)
        for selectedCategory in selectedDataset['categories']:
            categoryTitle = selectedCategory['title']
            categoryDescription = None
            if 'description' in selectedCategory:
                categoryDescription = selectedCategory['description']
            # create a category object
            category = Category(categoryTitle, categoryDescription)
            if 'fields' in selectedCategory:
                for selectedField in selectedCategory['fields']:
                    fieldName = selectedField['name']
                    fieldDescription = selectedField['description']
                    fieldIsPrivate = False
                    if 'private' in selectedField:
                        fieldIsPrivate = selectedField['private']
                    fieldIsTransformed = False
                    if 'transformed' in selectedField:
                        fieldIsTransformed = selectedField['transformed']
                    # create a field object only if the field is not private or the
                    # field is private and the settings indiate we want to display
                    # private fields.
                    if showPrivate == True or (showPrivate == False and fieldIsPrivate == False):
                        field = Field(fieldName, description=fieldDescription, private=fieldIsPrivate, transformed=fieldIsTransformed)
                        if "type" not in selectedField:
                            # the user has not documented a datatype, so let's
                            # guess what the data type is.
                            field.dataType = field.getDataType(df)
                        else:
                            # Documentation includes a data type, so use that instead
                            field.dataType = selectedField['type']
                        # add this field to the category
                        category.addField(field)
            # add this category to the dataset
            dataset.addCategory(category)
        # add this dataset to the list of datasets
        datasets.append(dataset)
        # determine if we want to add uncategorized field
        if showUncategorized:
            dataset.addUncategorizedFields(df)
    # generate search index
    search = generateSearch(datasets)
    """
    Render templates
    """
    # jinja2 templating settings
    env = Environment(loader=PackageLoader('makedocs', 'templates'))
    # make index page
    template = env.get_template('home.html')
    file = open('site/index.html', 'w')
    # documentation properties
    docTitle = None
    docDescription = None
    # NOTE(review): this discards the value read from the yaml earlier; the
    # conditional below only restores it when the setting is truthy.
    showPercentAnswered = None
    if datadocs['title']:
        docTitle = datadocs['title']
    if datadocs['show_percent_answered']:
        showPercentAnswered = datadocs['show_percent_answered']
    # check if there is an index.md file in /docs. If there is
    # open it up, convert the markdown contents and pass it along as
    # content
    try:
        content = markdown.markdown(open('docs/index.md', 'r').read())
    except:
        # best-effort: a missing/unreadable index.md just means no extra content
        content = None
    file.write(template.render(datasets=datasets, static="static", home="index.html", docTitle=docTitle,
                               search=search, content=content))
    for dataset in datasets:
        template = env.get_template('dataset.html')
        file = open('site/%s' % (dataset.getHtmlName()), 'w')
        # check if there is an [file_name].md file in /docs. If there is
        # open it up, convert the markdown contents and pass it along as
        # content
        try:
            content = markdown.markdown(open('docs/' + dataset.name + '.md', 'r').read())
        except:
            # best-effort: missing per-dataset markdown is not an error
            content = None
        file.write(template.render(dataset=dataset, datasets=datasets, static="static", home="index.html",
                                   docTitle=docTitle, showPercentAnswered=showPercentAnswered,
                                   showUncategorized=showUncategorized, search=search, content=content))
    # copy static folder (css and images)
    shutil.copytree("static", "site/static")