-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathnsfg2.py
82 lines (56 loc) · 2.1 KB
/
nsfg2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
from __future__ import print_function
import numpy as np
import thinkstats2
def MakeFrames():
"""Reads pregnancy data and partitions first babies and others.
returns: DataFrames (all live births, first babies, others)
"""
preg = ReadFemPreg()
live = preg[preg.outcome == 1]
firsts = live[live.birthord == 1]
others = live[live.birthord != 1]
assert(len(live) == 14292)
assert(len(firsts) == 6683)
assert(len(others) == 7609)
return live, firsts, others
def ReadFemPreg(dct_file='2006_2010_FemPregSetup.dct',
dat_file='2006_2010_FemPreg.dat.gz'):
"""Reads the NSFG 2006-2010 pregnancy data.
dct_file: string file name
dat_file: string file name
returns: DataFrame
"""
dct = thinkstats2.ReadStataDct(dct_file, encoding='iso-8859-1')
df = dct.ReadFixedWidth(dat_file, compression='gzip')
CleanFemPreg(df)
return df
def CleanFemPreg(df):
"""Recodes variables from the pregnancy frame.
df: DataFrame
"""
# mother's age is encoded in centiyears; convert to years
df.agepreg /= 100.0
# birthwgt_lb contains at least one bogus value (51 lbs)
# replace with NaN
df.birthwgt_lb1[df.birthwgt_lb1 > 20] = np.nan
# replace 'not ascertained', 'refused', 'don't know' with NaN
na_vals = [97, 98, 99]
df.birthwgt_lb1.replace(na_vals, np.nan, inplace=True)
df.birthwgt_oz1.replace(na_vals, np.nan, inplace=True)
# birthweight is stored in two columns, lbs and oz.
# convert to a single column in lb
# NOTE: creating a new column requires dictionary syntax,
# not attribute assignment (like df.totalwgt_lb)
df['totalwgt_lb'] = df.birthwgt_lb1 + df.birthwgt_oz1 / 16.0
# due to a bug in ReadStataDct, the last variable gets clipped;
# so for now set it to NaN
df.phase = np.nan
def main():
live, firsts, others = MakeFrames()
if __name__ == '__main__':
main()