-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2-Process-Data-Part2.py
168 lines (144 loc) · 6.93 KB
/
2-Process-Data-Part2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 16 16:16:10 2018
@author: Will Kew
"""
import pandas as pd
#import seaborn as sns
#This creates a string Formula for us
def formulator(C,H,O):
if C == 0:
return None
formula = "C"+str(C)+"H"+str(H)
if O > 0:
formula = formula+"O"+str(O)
return formula
#This calculates the heteroatomic class
def hetclasser(C,H,O):
if C == 0:
return None
if O>0:
hetclass = "O"+str(O)
else:
hetclass = "CH"
return hetclass
# Aromaticity Index calculation. Unlike DBE, this factors in O and S. Taken from DOI: 10.1002/rcm.2386
# Added switch for using the "modified" aromaticity index instead of the normal one. This halves #O on basis/assumption only half will be pi-bound.
def AIcalc(C,H,N,O,S,P,mod=True):
if C == 0:
return None
if mod==True:
O = O/2
top = 1+C-O-S-(0.5*H)
btm = C-O-S-N-P
if btm == 0:
AI = 0
else:
AI = top/btm
if AI < 0:
AI = 0
return AI
# Calculates DBE
def DBEcalc(C,H,N,X=0):
if C == 0:
return None
DBE = (C+1-(H/2)-(X/2)+(N/2))
return DBE
#divides without a division by zero error
def dividor(X,Y):
if Y == 0:
return None
else:
return (X/Y)
#counts number of 0s within a row (of 4)
def countnonzeros(data):
if 0 in data.value_counts():
return 4 - data.value_counts()[0]
else:
return 4
path = "F:/Will/Dropbox/Documents/University/Edinburgh/FTICRMS/MixedIonisation3-formularity/"
#path = "C:/Users/Will/Dropbox/Documents/University/Edinburgh/FTICRMS/MixedIonisation3-formularity/"
inputdata = path+"Assignments/"
outputdata = path +"GroupedAssignments/"
file = "Assignments.csv"
polarity = "Neg"
ionisationsources = ["APCI","APPI","LDI","ESI"]
df = pd.read_csv(inputdata+file,index_col=0)
nsamples = len(df.T)-15 # how many samples are in this dataframe?
samplenames = df.columns[15:].values.tolist() # Get the file names from the headers of the dataframe
# This section computes statistics for assignments for each sample.
for source in ionisationsources:
samplestopdrop = [x for x in samplenames if not source in x]
samplename = [x for x in samplenames if source in x]
df_new = df.copy() #create a new dataframe copy of our data to modify
df_new.drop(samplestopdrop,axis=1,inplace=True) # remove other samples from this dataframe
#this sums the intensities of our ionisation sources - i.e. making one column to check if peaks found in these samples
df_new.loc[:,"SumInt"] = df_new.apply(lambda row: (row[-4:].sum()),axis=1)
df_new = df_new[df_new["SumInt"]!=0] #remove rows where peaks werent found in given samples.
df_new.loc[:,"TimesFound"] = df_new.apply(lambda row: countnonzeros(row[-4:]),axis=1)
#add formula and heteroatomic classes
df_new.loc[:,"Formula"] = df_new.apply(lambda row: formulator(row['C'],row['H'],row['O']),axis=1)
df_new.loc[:,"HeteroClass"] = df_new.apply(lambda row: hetclasser(row['C'],row['H'],row['O']),axis=1)
df_new.loc[:,"OC"] = df_new.apply(lambda row: dividor(row['O'],row['C']),axis=1)
df_new.loc[:,"HC"] = df_new.apply(lambda row: dividor(row['H'],row['C']),axis=1)
df_new.loc[:,"AImod"] = df_new.apply(lambda row: AIcalc(row['C'],row['H'],0,row['O'],0,0),axis=1) #CHNOSP and optional bool 'mod' switch
df_new.loc[:,"DBE"] = df_new.apply(lambda row: DBEcalc(row['C'],row['H'],0),axis=1) #CHN and optional int X switch.
writer = pd.ExcelWriter(outputdata+source+"-"+polarity+'.xlsx')
df_new.to_excel(writer,'All Peaks')
df_new_nohits = df_new[df_new['C']==0].copy() # This filters to leave only isotopic hits
df_new_iso = df_new[df_new['C13']!=0].copy() # This filters to leave only isotopic hits
df_new = df_new[(df_new['C']!=0) & (df['C13']==0)] # This filters to leave only monoisotopic hits
df_new.to_excel(writer,'Hits')
df_new_iso.to_excel(writer,'Isotopic Hits')
df_new_nohits.to_excel(writer,'Unassigned')
writer.save()
"""
#Generate a new dataframe for our statistics output
statscols = ["Polarity","Ionisation Source", "Sample","Total Peaks","Monoisotopic Hits","Isotopic Hits",
"Unassigned","Percent Assigned","Percent Monoisotopic Assigned","Mean Error (ppm)","Std Error",
"Deprotonated","Radicals","Monoisotopic Deprotonated","Monoisotopic Radicals"]
df_statistics = pd.DataFrame(index=samplenames,columns=statscols)
for sample in samplenames:
samplestopdrop = [x for x in samplenames if not sample in x]
#samplename = [x for x in samplenames if source in x]
df_new = df.copy() #create a new dataframe copy of our data to modify
df_new.drop(samplestopdrop,axis=1,inplace=True) # remove other samples from this dataframe
df_new = df_new[df_new[sample]!=0]
df_statistics.loc[sample,"Polarity"] = polarity
df_statistics.loc[sample,"Ionisation Source"] = sample[:-13]
df_statistics.loc[sample,"Sample"] = sample[-8:]
df_statistics.loc[sample,"Total Peaks"] = len(df_new)
df_statistics.loc[sample,"Monoisotopic Hits"] = len(df_new[(df_new['Class']!="None") & (df_new['C13']==0)])
df_statistics.loc[sample,"Isotopic Hits"] = len(df_new[(df_new['Class']!="None") & (df_new['C13']!=0)])
df_statistics.loc[sample,"Unassigned"] = len(df_new[(df_new['Class']=="None")])
df_statistics.loc[sample,"Percent Assigned"] = (len(df_new)-len(df_new[(df_new['Class']=="None")]))/ len(df_new)
df_statistics.loc[sample,"Percent Monoisotopic Assigned"] = len(df_new[(df_new['Class']!="None") & (df_new['C13']==0)]) / len(df_new)
df_statistics.loc[sample,"Mean Error (ppm)"] = df_new[(df_new['Class']!="None")]['Error_ppm'].abs().mean()
df_statistics.loc[sample,"Std Error"] = df_new[(df_new['Class']!="None")]['Error_ppm'].abs().std()
df_statistics.loc[samplename,"Deprotonated"] = len(df_new[(df_new['State']=="H-")])
df_statistics.loc[samplename,"Radicals"] = len(df_new[(df_new['State']==".-")])
df_statistics.loc[samplename,"Monoisotopic Deprotonated"] = len(df_new[(df_new['State']=="H-") & (df_new['C13']==0)])
df_statistics.loc[samplename,"Monoisotopic Radicals"] = len(df_new[(df_new['State']==".-") & (df_new['C13']==0)])
writer = pd.ExcelWriter(path+"Hit Statistics/"+"HitStatistics.xlsx")
df_statistics.to_excel(writer,"Statistics")
# Get access to the workbook and sheet
workbook = writer.book
worksheet = writer.sheets['Statistics']
# Add a percent format with 1 decimal point
percent_fmt = workbook.add_format({'num_format': '0.0%', 'bold': False})
float_fmt = workbook.add_format({'num_format': '0.000', 'bold': False})
# Quota percent columns
worksheet.set_column('I:I', 16, percent_fmt)
worksheet.set_column('J:J', 29, percent_fmt)
#fix column widths
worksheet.set_column('A:A', 20)
worksheet.set_column('C:C', 17)
worksheet.set_column('E:E', 10)
worksheet.set_column('F:F', 17)
worksheet.set_column('G:G', 12)
worksheet.set_column('H:H', 10)
worksheet.set_column('K:K', 16,float_fmt)
worksheet.set_column('L:L', 10,float_fmt)
writer.save()
"""