-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_imaging.py
74 lines (59 loc) · 2.77 KB
/
preprocess_imaging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import re
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
def preprocess_imaging(df):
    """Append imaging-report vocabulary features to ``df`` as extra rows.

    Loads the two imaging tables stored under ``imaging/`` (relative to the
    current working directory), turns each into per-report vocabulary
    indicator columns via ``preprocess_imaging_helper``, and stacks the
    results underneath ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index level 1 identifies the records for which imaging
        rows should be kept (see ``preprocess_imaging_helper``).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of ``df``, the ``ct_img`` features and the
        ``sbo_img`` features. The helper's ``text_len`` column is dropped
        from both feature frames before concatenation.
    """
    ct_vocab = 'ct_hand'
    sboimg_vocab = 'sboimg_hand'

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    ct = pd.read_pickle('imaging/ct_full.pickle')
    sboimg = pd.read_pickle('imaging/sboimg_full.pickle')

    ct_text = preprocess_imaging_helper(ct, df, ct_vocab, 'ct_img')
    sboimg_text = preprocess_imaging_helper(
        sboimg, df, sboimg_vocab, 'sbo_img')

    # ``columns=`` instead of a positional axis: pandas 2.0 made every
    # argument of DataFrame.drop except ``labels`` keyword-only.
    frames = [
        df,
        ct_text.drop(columns=['text_len']),
        sboimg_text.drop(columns=['text_len']),
    ]
    # Default axis=0 stacks rows; sort=True orders the combined columns
    # when the inputs' columns are not already aligned.
    return pd.concat(frames, sort=True)
def preprocess_imaging_helper(ct, df, vocab, suff):
    """Build binary vocabulary-presence features for one imaging table.

    NOTE: not just for CT reports; the parameter is called ``ct`` for
    historical reasons and is used for any imaging table.

    Parameters
    ----------
    ct : pandas.DataFrame
        Imaging table. Must contain ``full_text`` and ``text_len`` columns,
        and after resetting its index must expose ``mrn``, ``id`` and
        ``datetime``. Index level 0 is moved to a column and the first
        remaining level is matched against ``df`` (presumably level 0 is
        ``mrn`` and the next level is ``id`` -- confirm against the data).
    df : pandas.DataFrame
        Frame whose index level 1 holds the keys of the records to keep.
    vocab : str
        Name of the pickle under ``vocabularies/`` (without extension). The
        unpickled object must provide ``'surg'`` and ``'non_surg'`` entries,
        each an iterable of terms.
    suff : str
        Suffix appended (as ``_<suff>``) to every generated term column and
        to the ``ind_event`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(mrn, id)`` with ``datetime`` as a regular column, the
        ``text_len`` column, one 0/1 column per vocabulary term named
        ``<term>_<suff>``, and ``ind_event_<suff>`` set to 1 on every row.
    """
    print('\nVocabulary: {}'.format(vocab))
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    vocabulary_dict = pd.read_pickle(
        'vocabularies/{}.pickle'.format(vocab))
    surg_words = list(vocabulary_dict['surg'])
    non_surg_words = list(vocabulary_dict['non_surg'])

    # Only imaging for people in df.
    ct = ct.reset_index(level=0)
    wanted = df.index.get_level_values(1).unique()
    # Keep only keys that actually have imaging rows. Current pandas raises
    # KeyError when .loc receives a list containing missing labels; older
    # versions inserted all-NaN rows that the dropna() below then removed.
    # Filtering first gives the same rows, in the same order, on both.
    wanted = wanted[wanted.isin(ct.index.get_level_values(0))]
    ct = (ct.loc[wanted, :]
            .dropna()  # drop reports with any missing field
            .reset_index()
            .set_index(['mrn', 'id', 'datetime']))

    # Term presence per report; binary=True caps every count at 1.
    count_vectorizer = CountVectorizer(
        vocabulary=surg_words + non_surg_words,
        binary=True)
    count_array = count_vectorizer.fit_transform(ct['full_text'].values)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        feature_names = count_vectorizer.get_feature_names_out()
    else:
        feature_names = count_vectorizer.get_feature_names()

    # toarray() yields a plain ndarray (todense() returns np.matrix).
    count_df = pd.DataFrame(
        count_array.toarray(),
        columns=feature_names,
        index=ct.index).add_suffix('_{}'.format(suff))

    # axis must be passed by keyword: positional use was removed in
    # pandas 2.0. reset_index(level=2) turns ``datetime`` into a column.
    ct_text = pd.concat(
        [ct[['text_len']], count_df],
        axis=1, sort=True).reset_index(level=2)

    # Marks every row as originating from this imaging source.
    ct_text['ind_event_{}'.format(suff)] = 1
    return ct_text
if __name__ == '__main__':
    # Script entry point. The input frame is loaded from a pickle path given
    # on the command line; the previous code passed an undefined name and
    # always failed with NameError.
    import sys

    if len(sys.argv) != 2:
        sys.exit('usage: python preprocess_imaging.py <df.pickle>')
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    preprocess_imaging(pd.read_pickle(sys.argv[1]))