......@@ -2,11 +2,33 @@ import logging, os
import numpy as np
import pandas as pd
log = logging.getLogger(os.path.basename(__file__))
from pyteomics import mzml, auxiliary # TODO use pymzml with python 3.x
from pyteomics import mzml, auxiliary
import pymzml
def mzML2DataFrames_faster(filename): # this one uses pymzml, TODO try mobiusklein/ms_deisotope
    idx_list = [] # TODO this should be a np.array
    fs_list = []
    i_list = []
    m_list = []
    msrun = pymzml.run.Reader(filename)
    for spectrum in msrun:
        idx = spectrum['id']
        fs = spectrum['MS:1000512'] # filter string
        m = spectrum.mz # m/z array
        i = spectrum.i  # intensity array
        # append to content
        count = len(i)
        idx_list.extend([idx] * count)
        fs_list.extend([fs] * count)
        i_list.extend(i)
        m_list.extend(m)
    return pd.DataFrame({'scanNum': idx_list, 'filterLine': fs_list, 'intensity': i_list, 'mass': m_list})
def mzML2DataFrames(filename):
def mzML2DataFrames(filename): # this one uses pyteomics
    idx_list = [] # TODO this should be a np.array
    fs_list = []
    i_list = []
......
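A minimal usage sketch for the pymzml-based reader above, assuming a local file named 'sample.mzML' (a hypothetical path): every peak becomes one row of the long-format frame, so per-scan spectra fall out of a groupby.

df = mzML2DataFrames_faster('sample.mzML')
print(df.columns.tolist()) # ['scanNum', 'filterLine', 'intensity', 'mass']
peaks_per_scan = df.groupby('scanNum')['mass'].count()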
import unittest

class TestStringMethods(unittest.TestCase):

    def test_upper(self):
        self.assertEqual('foo'.upper(), 'FOO')

    def test_isupper(self):
        self.assertTrue('FOO'.isupper())
        self.assertFalse('Foo'.isupper())

    def test_split(self):
        s = 'hello world'
        self.assertEqual(s.split(), ['hello', 'world'])
        # check that s.split fails when the separator is not a string
        with self.assertRaises(TypeError):
            s.split(2)

if __name__ == '__main__':
    unittest.main()
\ No newline at end of file
......@@ -146,7 +146,7 @@ def add_Sample(
# get MS2 scans
get_ms2Scans = [(t, mz, sn, sm, pol, tic, np, bp) for t, mz, sn, st, sm, pol, tic, np, bp \
in mz_file.scan_info(t1, t2, msm1, msm2) if st == 'MS2']
in mz_file.scan_info(t1, t2, msm1, msm2) if st == 'MS2'] #TODO this is a bug: should be scan_info(t1, t2, msmsm1, msmsm2)
ms2Scans = []
for t, precursor, sn, sm, pol, tic, np, bp in get_ms2Scans:
......
from pyteomics import mzml, auxiliary
import pandas as pd
import logging
import os

log = logging.getLogger(os.path.basename(__file__))

from lx.alignment import specEntry, linearAlignment # TODO mergeSumIntensity is used below but never imported

def add_Sample(
        sc=None,
        specFile=None,
        specDir=None,
        options={},
        **kwargs
):
    make_a_masterscan(specFile, options)
    specName = specFile
    lpdxSample_base_peak_ms1 = None
    nb_ms_scans = 0
    nb_ms_peaks = 0
    nb_msms_scans = 0
    nb_msms_peaks = 0
    return (specName, lpdxSample_base_peak_ms1, nb_ms_scans, nb_ms_peaks, nb_msms_scans, nb_msms_peaks)

def make_a_masterscan(mzml_file, options):
    scansDF, peaksDF = mzML2DataFrames(mzml_file)

    # as in readspectra, filter by mass and time
    no_result = (0, float('inf'))
    MS2massrange = options.get('MSMSmassrange', no_result)
    timerange = options.get('timerange', no_result) # the option is in seconds, the scan times are in minutes
    MS1massrange = options.get('MSmassrange', no_result)

    scans_pos = scansDF.loc[scansDF.positive_scan == True]
    scans_neg = scansDF.loc[scansDF.positive_scan == False]
    scans_ms1 = scansDF.loc[scansDF.msLevel == 1]
    scans_ms2 = scansDF.loc[scansDF.msLevel == 2]
    scans_timerange = scansDF.loc[scansDF.time.multiply(60).between(*timerange)]
    peaks_MS1massrange = peaksDF.loc[peaksDF.m.between(*MS1massrange)]
    peaks_MS2massrange = peaksDF.loc[peaksDF.m.between(*MS2massrange)]

    # as in https://stackoverflow.com/questions/11976503/how-to-keep-index-when-using-pandas-merge
    ms1_pos_scans = scansDF.reset_index().merge(scans_ms1).merge(scans_pos).merge(scans_timerange).set_index('id')
    spectraDF = peaks_MS1massrange.merge(ms1_pos_scans.max_i, left_index=True, right_index=True) # left_/right_index keep the index
    spectraDF['rel_i'] = spectraDF.i / spectraDF.max_i

    # build the ms1Scans as in the old version, TODO avoid doing this
    ms1Scans = []
    for scan_tuple in ms1_pos_scans.itertuples():
        scan_peaks = spectraDF.loc[scan_tuple.Index]
        scan_processed = [(peak_tuple.m, peak_tuple.i, peak_tuple.rel_i, 0, 0, 0, 0) for peak_tuple in scan_peaks.itertuples()] # padded with zeroes because this is the expected format
        ms1Scans.append({
            'time': scan_tuple.time,
            'totIonCurrent': scan_tuple.tic,
            'polarity': '+' if scan_tuple.positive_scan else '-',
            'max_it': scan_tuple.max_i,
            'scan': scan_processed})

    # --------------- now for the ms2
    # as in https://stackoverflow.com/questions/11976503/how-to-keep-index-when-using-pandas-merge
    ms2_pos_scans = scansDF.reset_index().merge(scans_ms2).merge(scans_pos).merge(scans_timerange).set_index('id')
    spectra2DF = peaks_MS2massrange.merge(ms2_pos_scans.max_i, left_index=True, right_index=True) # left_/right_index keep the index
    spectra2DF['rel_i'] = spectra2DF.i / spectra2DF.max_i

    # build the ms2Scans as in the old version, TODO avoid doing this
    ms2Scans = [] # TODO this differs from the original, not clear why
    for scan_tuple in ms2_pos_scans.loc[spectra2DF.index.unique()].itertuples(): # some scans don't have peaks
        scan_peaks = spectra2DF.loc[scan_tuple.Index]
        if isinstance(scan_peaks, pd.Series): # only one entry
            scan_processed = [(scan_peaks['m'], scan_peaks['i'], scan_peaks['rel_i'], 0, 0, 0, 0)]
        else:
            scan_processed = [(peak_tuple.m, peak_tuple.i, peak_tuple.rel_i, 0, 0, 0, 0) for peak_tuple in scan_peaks.itertuples()] # padded with zeroes because this is the expected format
        ms2Scans.append({
            'time': scan_tuple.time,
            'totIonCurrent': scan_tuple.tic,
            'precursorMz': scan_tuple.target_mz, # TODO replace this with the trigger scan
            'polarity': '+' if scan_tuple.positive_scan else '-',
            'max_it': scan_tuple.max_i,
            'scan': scan_processed})

    # -------------------------- make the dict of spec entries for the linear alignment
    dictSpecEntry = {}
    # TODO use this: specEntry = namedtuple('specEntry', 'mass content')
    specDF = spectraDF.loc[(spectraDF.i > 0) & (spectraDF.rel_i > 0)] # not sure why this is needed
    for id, group in ms1_pos_scans.groupby('id'): # see definition above
        scan_peaks = specDF.loc[id]
        # specEntry(mass, content={})
        dictSpecEntry[id] = [specEntry(peak.m, {'sample': id, 'intensity': peak.i, 'intensity_rel': peak.rel_i}) for peak in scan_peaks.itertuples()]

    # ----------------------- TODO refactor linearAlignment
    fadi_denominator = len(dictSpecEntry) # the number of aligned scans
    fadi_percentage = options['MSfilter']
    listClusters = linearAlignment(list(dictSpecEntry.keys()),
                                   dictSpecEntry,
                                   options['MSresolution'],
                                   merge=mergeSumIntensity,
                                   mergeTolerance=options['MSresolution'],
                                   mergeDeltaRes=options['MSresolutionDelta'],
                                   fadi_denominator=fadi_denominator,
                                   fadi_percentage=fadi_percentage)

def mzML2DataFrames(filename): # TODO move this to input2dataframe
    scans = []
    peaks_dfs = []
    with mzml.read(filename) as reader:
        for item in reader:
            id = item['id']
            idx = item['index'] + 1
            fs = item['scanList']['scan'][0]['filter string']
            time = item['scanList']['scan'][0]['scan start time'] # the DataFrame turns the unitfloat into a plain float
            msLevel = item['ms level']
            positive_scan = 'positive scan' in item
            if not positive_scan: item['negative scan'] # raises KeyError if the scan is neither positive nor negative
            p_data = item.get('precursorList', None) # helper
            precursor_id = p_data['precursor'][0]['spectrumRef'] if p_data else None # TODO target_mz is needed by make_a_masterscan but not collected here
            max_i = item['base peak intensity']
            tic = item['total ion current']
            # collect the scans data
            row = (id, idx, fs, time, msLevel, positive_scan, precursor_id, max_i, tic)
            scans.append(row)
            # collect the peaks data
            i = item['intensity array']
            m = item['m/z array']
            cols = {'m': m, 'i': i}
            df = pd.DataFrame(cols)
            df['id'] = id
            df.set_index('id', inplace=True)
            peaks_dfs.append(df)
    scansDF = pd.DataFrame(scans, columns=['id', 'idx', 'filter_string', 'time', 'msLevel', 'positive_scan', 'precursor_id', 'max_i', 'tic'])
    scansDF.set_index('id', inplace=True)
    peaksDF = pd.concat(peaks_dfs)
    return scansDF, peaksDF
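The two frames share the native spectrum id as their index, which is what make_a_masterscan relies on when it merges scan metadata onto peaks. A minimal usage sketch, assuming a local 'sample.mzML' (a hypothetical path):

scansDF, peaksDF = mzML2DataFrames('sample.mzML')
ms1_ids = scansDF.loc[scansDF.msLevel == 1].index
first_ms1_spectrum = peaksDF.loc[ms1_ids[0]] # all (m, i) rows of the first MS1 scan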
......@@ -20,6 +20,9 @@ from lx.alignment import alignPIS, mkSurveyLinear, mkSurveyHierarchical, \
from lx.debugger import Debug
from lx.readSpectra_refactor import add_Sample as add_Sample_ref
# for debugging
#from guppy import hpy
......@@ -455,6 +458,18 @@ def doImport(options, scan, importDir, output, parent, listFiles, isTaken, isGro
MSMSthresholdType = scan.options['MSMSthresholdType'])
elif options['spectraFormat'] == 'mzML': # the new import routine, :-)
# ret_ref = add_Sample_ref(scan, i[0], i[1],
# options = scan.options,
# timerange = scan.options['timerange'],
# MSmassrange = scan.options['MSmassrange'],
# MSMSmassrange = scan.options['MSMSmassrange'],
# scanAveraging = scanAvg,
# isGroup = isGroup,
# importMSMS = importMSMS,
# MSthresholdType = scan.options['MSthresholdType'],
# MSMSthresholdType = scan.options['MSMSthresholdType'],
# fileformat = "mzML")
ret = add_Sample(scan, i[0], i[1],
options = scan.options,
timerange = scan.options['timerange'],
......@@ -466,6 +481,7 @@ def doImport(options, scan, importDir, output, parent, listFiles, isTaken, isGro
MSthresholdType = scan.options['MSthresholdType'],
MSMSthresholdType = scan.options['MSMSthresholdType'],
fileformat = "mzML")
print(ret[0])
#elif options['spectraFormat'] == 'raw':
#ret = add_Sample(scan, i[0], i[1],
......
'''
tool to handle the spectra
'''
import pandas as pd
import numpy as np
from collections import namedtuple
import logging, os

log = logging.getLogger(os.path.basename(__file__))

from pyteomics import mzml, auxiliary

def mzML2DataFrames(filename, test_sample=False): # this one uses pyteomics
    scans = []
    peaks_dfs = []
    with mzml.read(filename) as reader:
        for item in reader:
            id = item['id']
            idx = item['index']
            fs = item['scanList']['scan'][0]['filter string']
            if item['scanList']['count'] != 1: raise NotImplementedError("we don't read more than one scan per entry... yet")
            time = item['scanList']['scan'][0]['scan start time'] # the DataFrame turns the unitfloat into a plain float
            msLevel = item['ms level']
            positive_scan = 'positive scan' in item
            if not positive_scan: item['negative scan'] # raises KeyError if the scan is neither positive nor negative
            p_data = item.get('precursorList', None) # helper
            if p_data and p_data['count'] != 1: raise NotImplementedError("we don't read more than one precursor per scan... yet")
            precursor_id = p_data['precursor'][0]['spectrumRef'] if p_data else None
            target_mz = p_data['precursor'][0]['isolationWindow']['isolation window target m/z'] if p_data else None
            max_i = item['base peak intensity']
            tic = item['total ion current']
            # collect the scans data
            row = (id, idx, fs, time, msLevel, positive_scan, precursor_id, max_i, tic, target_mz)
            scans.append(row)
            # collect the peaks data
            i = item['intensity array']
            m = item['m/z array']
            cols = {'m': m, 'i': i}
            df = pd.DataFrame(cols)
            df['id'] = id
            df.set_index('id', inplace=True)
            peaks_dfs.append(df)
            # for testing
            if test_sample and len(scans) > 100: # TODO remove this
                break
    scansDF = pd.DataFrame(scans, columns=['id', 'idx', 'filter_string', 'time', 'msLevel', 'positive_scan', 'precursor_id', 'max_i', 'tic', 'target_mz'])
    scansDF.set_index('id', inplace=True)
    peaksDF = pd.concat(peaks_dfs)
    return scansDF, peaksDF
class SpectraUtil:
    'Util to handle spectra'

    def __init__(self, scansDF, peaksDF, filename=None):
        self._original_scansDF = scansDF
        self._original_peaksDF = peaksDF
        self.scansDF = self._original_scansDF
        self.peaksDF = self._original_peaksDF
        self._filename = filename

    @staticmethod
    def fromFile(filename, test_sample=False):
        return SpectraUtil(*mzML2DataFrames(filename, test_sample), filename)

    # note: to help debug maybe use
    # @property
    # def scansDF(self):
    #     return self.scansDF

    def reset(self):
        print('resetting to the original')
        self.scansDF = self._original_scansDF
        self.peaksDF = self._original_peaksDF

    def get_reset_copy(self):
        print('a copy of the original with nothing set... sorry, no undo')
        return SpectraUtil(self._original_scansDF, self._original_peaksDF, self._filename)

    def get_current_copy(self):
        print('a copy of the current state... just in case')
        return SpectraUtil(self.scansDF, self.peaksDF, self._filename)

    def set_timerange(self, t0, t1):
        print(f'time range in seconds: {t0} to {t1}')
        self.scansDF = self.scansDF.loc[self.scansDF.time.multiply(60).between(t0, t1)]

    def set_mode(self, positive_mode=True):
        print(f'set mode to positive: {positive_mode} (False means negative)')
        self.scansDF = self.scansDF.loc[self.scansDF.positive_scan == positive_mode]

    def set_ms_level(self, level=1):
        print(f'set ms level to: {level}')
        self.scansDF = self.scansDF.loc[self.scansDF.msLevel == level]

    def set_mass_range(self, m0, m1):
        print(f'mass range from: {m0} to {m1}')
        self.peaksDF = self.peaksDF.loc[self.peaksDF.m.between(m0, m1)]

    def make_rel_i(self):
        print('calculate the relative intensities as: rel_i')
        # left_/right_index keep the index
        spectraDF = self.peaksDF.merge(self.scansDF.max_i, left_index=True, right_index=True)
        self.peaksDF['rel_i'] = spectraDF.i / spectraDF.max_i

    def set_min_i(self, min_i=0):
        print(f'set the minimum intensity to {min_i}')
        self.peaksDF = self.peaksDF.loc[self.peaksDF.i > min_i]

    def round_m(self, decimals=4):
        print(f'set the precision of m/z to {decimals} decimal places')
        self.peaksDF['m'] = self.peaksDF.m.round(decimals)

    def get_fragment_scans(self, scan_index):
        print(f'scans triggered by scan {scan_index}')
        return self._original_scansDF.loc[self._original_scansDF.precursor_id == scan_index]

    def get_fragment_peaks(self, scan_index):
        print(f'peaks triggered by scan {scan_index}')
        # for scantuple in spectraUtil.scansDF.itertuples():
        #     dosomething(spectraUtil.get_fragment_peaks(scantuple.Index))
        return self._original_peaksDF.loc[self._original_peaksDF.index.isin(self.get_fragment_scans(scan_index).index)]

    def get_nearest(self, targetsDF, peaksDF=None, on='m', tol=0.01):
        print(f'find the peaks nearest to the targets with a tolerance of {tol}')
        if peaksDF is None: # `== None` would compare elementwise on a DataFrame
            peaksDF = self.peaksDF
        s_peaksDF = peaksDF.reset_index().sort_values(on)
        s_targetDF = targetsDF.reset_index().sort_values(on)
        tmp_type = peaksDF[on].dtype
        s_targetDF[on] = s_targetDF[on].astype(tmp_type)
        s_targetDF['target'] = s_targetDF[on]
        return pd.merge_asof(s_peaksDF, s_targetDF, left_on=on, right_on=on, tolerance=tol, direction='nearest').dropna()
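A minimal filtering sketch for SpectraUtil, using the test file referenced below; the setters narrow scansDF/peaksDF in place, and reset() restores the frames read from disk.

su = SpectraUtil.fromFile('test_resources\\small_test\\190321_Serum_Lipidextract_368723_01.mzML', test_sample=True)
su.set_ms_level(1)
su.set_mode(positive_mode=True)
su.set_timerange(60, 300) # seconds
su.set_mass_range(400.0, 1000.0)
su.make_rel_i()
su.set_min_i(0)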
import unittest
from LX2_MS_reader import SpectraUtil

class TestLX2_MS_reader(unittest.TestCase):

    def setUp(self):
        filename = 'test_resources\\small_test\\190321_Serum_Lipidextract_368723_01.mzML'
        self.spectraUtil = SpectraUtil.fromFile(filename, test_sample=True)

    def test_read(self):
        scan_cols = ['idx', 'filter_string', 'time', 'msLevel', 'positive_scan',
                     'precursor_id', 'max_i', 'tic', 'target_mz']
        peak_cols = ['m', 'i']
        self.assertListEqual(self.spectraUtil.scansDF.columns.tolist(), scan_cols)
        self.assertListEqual(self.spectraUtil.peaksDF.columns.tolist(), peak_cols)

if __name__ == '__main__':
    unittest.main()
# from http://www.dalkescientific.com/writings/NBN/parsing_with_ply.html
# A grammar for chemical equations like "H2O", "CH3COOH" and "H2SO4"
# Uses David Beazley's PLY parser.
# Implements two functions: count the total number of atoms in the equation and
# count the number of times each element occurs in the equation.

import ply.lex as lex
import ply.yacc as yacc

tokens = (
    "SYMBOL",
    "NUM",
    "DOT",
    'LPAREN',
    'RPAREN'
)

t_SYMBOL = (
    r"C[laroudsemf]?|Os?|N[eaibdpos]?|S[icernbmg]?|P[drmtboau]?|"
    r"H[eofgas]?|A[lrsgutcm]|B[eraik]?|Dy|E[urs]|F[erm]?|G[aed]|"
    r"I[nr]?|Kr?|L[iaur]|M[gnodt]|R[buhenaf]|T[icebmalh]|"
    r"U|V|W|Xe|Yb?|Z[nr]")
t_DOT = r'\.'
t_LPAREN = r'\['
t_RPAREN = r'\]'

def t_NUM(t):
    r"\d+"
    t.value = int(t.value)
    return t

t_ignore = ' \t'

def t_error(t):
    raise TypeError("Unknown text '%s'" % (t.value,))

lexer1 = lex.lex()

def p_chemical_equation(p):
    """
    chemical_equation :
    chemical_equation : species_list
    """
    if len(p) == 1:
        # the empty string means there are no atomic symbols
        p[0] = {}
    else:
        p[0] = p[1]

def p_species_list(p):
    "species_list : species_list species"
    p[1].update(p[2])
    p[0] = p[1]

def p_species(p):
    "species_list : species"
    p[0] = p[1]

def p_single_species(p):
    """
    species : SYMBOL
    species : SYMBOL NUM
    species : SYMBOL LPAREN NUM RPAREN
    species : SYMBOL LPAREN NUM DOT DOT NUM RPAREN
    """
    if len(p) == 2:
        p[0] = {p[1]: (1, 1)}
    elif len(p) == 3:
        p[0] = {p[1]: (p[2], p[2])}
    elif len(p) == 5:
        p[0] = {p[1]: (p[3], p[3])}
    elif len(p) == 8:
        p[0] = {p[1]: (p[3], p[6])}

def p_error(p):
    raise TypeError("unknown text at %r" % (p.value,))

parser1 = yacc.yacc() # (debug=0, optimize=0)

def txt2dict(txt):
    return parser1.parse(txt, lexer=lexer1)
\ No newline at end of file
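A small usage sketch of the parser above, grounded in the grammar: a bare symbol maps to (1, 1), "H2" to (2, 2), and the bracketed form (LPAREN/RPAREN match '[' and ']') to a (low, high) range.

print(txt2dict('H2O'))
# {'H': (2, 2), 'O': (1, 1)}
print(txt2dict('C[30..80] H[40..300] O[10] N[1] P[1]'))
# {'C': (30, 80), 'H': (40, 300), 'O': (10, 10), 'N': (1, 1), 'P': (1, 1)}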
from collections import namedtuple
mfql_dict = {}
Var = namedtuple('Var', 'id object Options')
Obj = namedtuple('Obj', 'p_rule p_values')
Func = namedtuple('Func', 'func on')
ElementSeq = namedtuple('ElementSeq', 'txt')
Evaluable = namedtuple('Evaluable', 'operation term_1 term_2')
ReportItem = namedtuple('ReportItem', 'id p_values')
import ply.lex as lex
import re

keywords = (
    'IDENTIFY', 'WITH', 'AND', 'OR',
    'TOLERANCE', 'MINOCC', 'MAXOCC', 'MASSRANGE', 'DBR', 'CHG',
    'NOT', 'DA', 'PPM', 'RES', 'DEFINE', 'IN', 'MS1', 'MS2',
    'REPORT', 'SUCHTHAT',
    'QUERYNAME', 'AS', 'NEUTRALLOSS'
)

# list of token names
tokens = keywords + (
    'EQUALS', 'IS', 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'LPAREN',
    'RPAREN', 'LT', 'LE', 'GT', 'GE', 'IFA', 'IFF', 'NE', 'COMMA', 'SEMICOLON',
    'FLOAT', 'STRING', 'ID', 'INTEGER', 'DOT', 'PERCENT', 'LBRACE',
    'RBRACE', 'LBRACKET', 'RBRACKET', 'SFSTRING', 'ARROW', 'ARROWR'
)

def t_ID(t):
    r'[a-zA-Z$][a-zA-Z$0-9]*'
    if t.value in keywords or t.value.upper() in keywords:
        t.type = t.value.upper()
    return t

# regular expression rules for simple tokens
t_EQUALS = r'=='
t_IS = r'='
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_LBRACE = r'\['
t_RBRACE = r'\]'
t_LBRACKET = r'\{'
t_RBRACKET = r'\}'
t_LE = r'<='
t_NE = r'<>'
t_LT = r'<'
t_GE = r'>='
t_GT = r'>'
t_ARROW = r'->'
t_ARROWR = r'<~'
t_IFA = r'=>'
t_IFF = r'<=>'
t_COMMA = r'\,'
t_SEMICOLON = r';'
t_FLOAT = r'(\+|-)?((\d*\.\d+)(E[\+-]?\d+)?|([1-9]\d*E[\+-]?\d+))'
t_INTEGER = r'(\+|-)?\d+'
t_STRING = r'\".*?\"'
t_SFSTRING = r'\'.*?\''
t_DOT = r'\.'
t_PERCENT = r'%'

def t_comment(t):
    r'[ ]*\043[^\n]*'
    t.lexer.lineno += t.value.count("\n")
    pass

def t_WS(t):
    r'[ \t]+'
    pass

def t_WS_NL(t):
    r'(([ \t]*)\n)'
    t.lexer.lineno += t.value.count("\n")
    pass

def t_UNDERSCORE(t):
    r'_'
    raise SyntaxError("No underscore allowed in a query.")

def t_error(t):
    if not ord(t.value[0]) == 13:
        raise SyntaxError("Illegal character %s (%s) in line %d" % (t.value[0], ord(t.value[0]), t.lexer.lineno))
    t.lexer.skip(1)

# build lexer
lexer = lex.lex() # = lex.lex(reflags = re.I, debug = 0, optimize = 0)
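A minimal tokenizing sketch for this lexer: t_ID folds recognized keywords into their own token types, so an MFQL header line comes back as keyword tokens rather than plain IDs.

lexer.input('QUERYNAME = PCFAS;')
for tok in iter(lexer.token, None):
    print(tok.type, tok.value)
# QUERYNAME QUERYNAME / IS = / ID PCFAS / SEMICOLON ;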
Created by PLY version 3.11 (http://www.dabeaz.com/ply)
Grammar
Rule 0 S' -> chemical_equation
Rule 1 chemical_equation -> <empty>
Rule 2 chemical_equation -> species_list
Rule 3 species_list -> species_list species
Rule 4 species_list -> species
Rule 5 species -> SYMBOL
Rule 6 species -> SYMBOL NUM
Rule 7 species -> SYMBOL LPAREN NUM RPAREN
Rule 8 species -> SYMBOL LPAREN NUM DOT DOT NUM RPAREN
Terminals, with rules where they appear
DOT : 8 8
LPAREN : 7 8
NUM : 6 7 8 8
RPAREN : 7 8
SYMBOL : 5 6 7 8
error :
Nonterminals, with rules where they appear
chemical_equation : 0
species : 3 4
species_list : 2 3
Parsing method: LALR
state 0
(0) S' -> . chemical_equation
(1) chemical_equation -> .
(2) chemical_equation -> . species_list
(3) species_list -> . species_list species
(4) species_list -> . species
(5) species -> . SYMBOL
(6) species -> . SYMBOL NUM
(7) species -> . SYMBOL LPAREN NUM RPAREN
(8) species -> . SYMBOL LPAREN NUM DOT DOT NUM RPAREN
$end reduce using rule 1 (chemical_equation -> .)
SYMBOL shift and go to state 4
chemical_equation shift and go to state 1
species_list shift and go to state 2
species shift and go to state 3
state 1
(0) S' -> chemical_equation .
state 2
(2) chemical_equation -> species_list .
(3) species_list -> species_list . species
(5) species -> . SYMBOL
(6) species -> . SYMBOL NUM
(7) species -> . SYMBOL LPAREN NUM RPAREN
(8) species -> . SYMBOL LPAREN NUM DOT DOT NUM RPAREN
$end reduce using rule 2 (chemical_equation -> species_list .)
SYMBOL shift and go to state 4
species shift and go to state 5
state 3
(4) species_list -> species .
SYMBOL reduce using rule 4 (species_list -> species .)
$end reduce using rule 4 (species_list -> species .)
state 4
(5) species -> SYMBOL .
(6) species -> SYMBOL . NUM
(7) species -> SYMBOL . LPAREN NUM RPAREN
(8) species -> SYMBOL . LPAREN NUM DOT DOT NUM RPAREN
SYMBOL reduce using rule 5 (species -> SYMBOL .)
$end reduce using rule 5 (species -> SYMBOL .)
NUM shift and go to state 6
LPAREN shift and go to state 7
state 5
(3) species_list -> species_list species .
SYMBOL reduce using rule 3 (species_list -> species_list species .)
$end reduce using rule 3 (species_list -> species_list species .)
state 6
(6) species -> SYMBOL NUM .
SYMBOL reduce using rule 6 (species -> SYMBOL NUM .)
$end reduce using rule 6 (species -> SYMBOL NUM .)
state 7
(7) species -> SYMBOL LPAREN . NUM RPAREN
(8) species -> SYMBOL LPAREN . NUM DOT DOT NUM RPAREN
NUM shift and go to state 8
state 8
(7) species -> SYMBOL LPAREN NUM . RPAREN
(8) species -> SYMBOL LPAREN NUM . DOT DOT NUM RPAREN
RPAREN shift and go to state 9
DOT shift and go to state 10
state 9
(7) species -> SYMBOL LPAREN NUM RPAREN .
SYMBOL reduce using rule 7 (species -> SYMBOL LPAREN NUM RPAREN .)
$end reduce using rule 7 (species -> SYMBOL LPAREN NUM RPAREN .)
state 10
(8) species -> SYMBOL LPAREN NUM DOT . DOT NUM RPAREN
DOT shift and go to state 11
state 11
(8) species -> SYMBOL LPAREN NUM DOT DOT . NUM RPAREN
NUM shift and go to state 12
state 12
(8) species -> SYMBOL LPAREN NUM DOT DOT NUM . RPAREN
RPAREN shift and go to state 13
state 13
(8) species -> SYMBOL LPAREN NUM DOT DOT NUM RPAREN .
SYMBOL reduce using rule 8 (species -> SYMBOL LPAREN NUM DOT DOT NUM RPAREN .)
$end reduce using rule 8 (species -> SYMBOL LPAREN NUM DOT DOT NUM RPAREN .)
# parsetab.py
# This file is automatically generated. Do not edit.
# pylint: disable=W,C,R
_tabversion = '3.10'
_lr_method = 'LALR'
_lr_signature = 'DOT LPAREN NUM RPAREN SYMBOL\n chemical_equation :\n chemical_equation : species_list\n species_list : species_list speciesspecies_list : species\n species : SYMBOL\n species : SYMBOL NUM\n species : SYMBOL LPAREN NUM RPAREN\n species : SYMBOL LPAREN NUM DOT DOT NUM RPAREN\n '
_lr_action_items = {'$end':([0,1,2,3,4,5,6,9,13,],[-1,0,-2,-4,-5,-3,-6,-7,-8,]),'SYMBOL':([0,2,3,4,5,6,9,13,],[4,4,-4,-5,-3,-6,-7,-8,]),'NUM':([4,7,11,],[6,8,12,]),'LPAREN':([4,],[7,]),'RPAREN':([8,12,],[9,13,]),'DOT':([8,10,],[10,11,]),}
_lr_action = {}
for _k, _v in _lr_action_items.items():
for _x,_y in zip(_v[0],_v[1]):
if not _x in _lr_action: _lr_action[_x] = {}
_lr_action[_x][_k] = _y
del _lr_action_items
_lr_goto_items = {'chemical_equation':([0,],[1,]),'species_list':([0,],[2,]),'species':([0,2,],[3,5,]),}
_lr_goto = {}
for _k, _v in _lr_goto_items.items():
for _x, _y in zip(_v[0], _v[1]):
if not _x in _lr_goto: _lr_goto[_x] = {}
_lr_goto[_x][_k] = _y
del _lr_goto_items
_lr_productions = [
("S' -> chemical_equation","S'",1,None,None,None),
('chemical_equation -> <empty>','chemical_equation',0,'p_chemical_equation','chemParser.py',44),
('chemical_equation -> species_list','chemical_equation',1,'p_chemical_equation','chemParser.py',45),
('species_list -> species_list species','species_list',2,'p_species_list','chemParser.py',54),
('species_list -> species','species_list',1,'p_species','chemParser.py',59),
('species -> SYMBOL','species',1,'p_single_species','chemParser.py',64),
('species -> SYMBOL NUM','species',2,'p_single_species','chemParser.py',65),
('species -> SYMBOL LPAREN NUM RPAREN','species',4,'p_single_species','chemParser.py',66),
('species -> SYMBOL LPAREN NUM DOT DOT NUM RPAREN','species',7,'p_single_species','chemParser.py',67),
]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple

class Targets_util():
    mass_of = {
        'C': 12,
        'H': 1.0078250321,
        'N': 14.0030740052,
        'P': 30.97376151,
        'O': 15.9949146221,
        'S': 31.97207069
    }

    def __init__(self, elements):
        self.elements = elements
        self._ranges = {}
        self._df = None
        self._build()

    def _build(self):
        self._makeRanges()
        self._makeDF()
        self._cal_M()
        self._cal_dbr()
        self._cal_chem()

    def _makeRanges(self):
        for name, bounds in self.elements.items():
            self._ranges[name] = np.arange(bounds[0], bounds[1] + 1)

    def _makeDF(self):
        index = pd.MultiIndex.from_product(self._ranges.values(), names=self._ranges.keys())
        self._df = pd.DataFrame(index=index).reset_index()

    def _cal_M(self):
        self._df['m'] = 0
        for colName in self._df.columns.intersection(self.mass_of.keys()):
            self._df['m'] += self._df[colName] * self.mass_of[colName]

    def _cal_dbr(self):
        x = self._df.get('Cl', 0) + self._df.get('Br', 0) + self._df.get('I', 0) + self._df.get('F', 0)
        self._df['dbr'] = self._df.get('C', 0) + 1 - self._df.get('H', 0) / 2 - x / 2 + self._df.get('N', 0) / 2

    def _cal_chem(self):
        # https://stackoverflow.com/questions/52673285/performance-of-pandas-apply-vs-np-vectorize-to-create-new-column-from-existing-c
        self._df['chem'] = ''
        for colName in self._df.columns.intersection(self.mass_of.keys()):
            self._df['chem'] += colName + self._df[colName].apply(str) + ' '
        self._df['chem'] = self._df['chem'].str[:-1]

    def set_dbr(self, dbr_l, dbr_u): # lower and upper bound, in that order
        self._df = self._df.loc[self._df.dbr.between(dbr_l, dbr_u)]

    @staticmethod
    def set_max_daltons(matchesDF, max_daltons=0.01):
        matchesDF['err'] = matchesDF['m'] - matchesDF['target']
        matchesDF['err'] = matchesDF['err'].abs()
        matchesDF['min_err'] = matchesDF.groupby(['id', 'chem'])['err'].transform('min')
        matchesDF = matchesDF.loc[matchesDF['err'] == matchesDF['min_err']]
        matchesDF.drop('min_err', axis=1, inplace=True)
        matchesDF['ppm'] = matchesDF['err'] / matchesDF['target'] * 1_000_000
        matchesDF = matchesDF.loc[matchesDF['err'] <= max_daltons]
        # matchesDF.drop('err', axis=1, inplace=True)
        return matchesDF

    @staticmethod
    def makeAllCombo(pr_df, fr_df):
        # cross join via a constant key; pandas >= 1.2 could use merge(how='cross')
        pr_df = pr_df.add_prefix('PR_')
        fr_df = fr_df.add_prefix('FR_')
        pr_df.loc[:, 'key'] = 0
        fr_df.loc[:, 'key'] = 0
        in_df = pr_df.merge(fr_df, on='key')
        pr_df.drop('key', axis=1, inplace=True)
        fr_df.drop('key', axis=1, inplace=True)
        in_df.drop('key', axis=1, inplace=True)
        return in_df

    @staticmethod
    def devideAllCombo(all_df):
        all_df.sort_values(['PR_ppm', 'FR_ppm'], inplace=True)
        cols = all_df.columns
        pr_cols = [col for col in cols if col.startswith('PR_')]
        fr_cols = [col for col in cols if col.startswith('FR_')]
        pr_df = all_df.loc[:, pr_cols]
        fr_df = all_df.loc[:, fr_cols]
        pr_df.sort_values('PR_ppm', inplace=True)
        pr_df.drop_duplicates(inplace=True)
        fr_df.drop_duplicates(inplace=True)
        return pr_df, fr_df

    @staticmethod
    def suchThat(df, query):
        return df.query(query)

    @staticmethod
    def summaryDF(df, prefix='PR_', quantile=0.25):
        groups = df.groupby([prefix + 'C', prefix + 'dbr'])
        columns = ['C_dbr', 'ppm_mean', 'i_mean', 'i_rsd', 'count']
        columns = [prefix + col for col in columns]
        tups = []
        for e, g_df in groups:
            tup = (e,
                   g_df[prefix + 'ppm'].mean(),
                   g_df[prefix + 'i'].mean(),
                   g_df[prefix + 'i'].std() / g_df[prefix + 'i'].mean() * 100,
                   g_df[prefix + 'm'].count())
            tups.append(tup)
        df_summary = pd.DataFrame(tups, columns=columns)
        sort_col = columns[1]
        df_summary.sort_values(sort_col, ascending=True, inplace=True)
        sort_smallest = df_summary[sort_col] <= df_summary[sort_col].quantile(quantile)
        return df_summary.loc[sort_smallest]

    @staticmethod
    def lollipop_plot(m, i):
        # https://python-graph-gallery.com/180-basic-lollipop-plot/
        # https://stackoverflow.com/questions/1358977/how-to-make-several-plots-on-a-single-page-using-matplotlib
        (markerline, stemlines, baseline) = plt.stem(m, i) # , markerfmt=' '
        plt.setp(baseline, visible=False)
        return plt

    @staticmethod
    def showAll_lollipop(df, prefix='PR_', sample=False):
        groups = df.groupby([prefix + 'C', prefix + 'dbr'])
        for e, g_df in groups:
            plt = Targets_util.lollipop_plot(g_df[prefix + 'm'], g_df[prefix + 'i'])
            plt.title(e)
            plt.xlim([g_df[prefix + 'm'].min(), g_df[prefix + 'm'].max()])
            plt.show()
            if sample: break # only show one
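A minimal usage sketch mirroring the test below (the element bounds are illustrative): enumerate elemental compositions, constrain the double-bond equivalents, then cross-combine precursor and fragment candidates and filter with a query string.

elements = {'C': (30, 40), 'H': (40, 80), 'O': (10, 10), 'N': (1, 1), 'P': (1, 1)}
targets = Targets_util(elements)
targets.set_dbr(2.5, 14.5) # keep 2.5 <= dbr <= 14.5
combos = Targets_util.makeAllCombo(targets._df.head(10), targets._df.head(3))
even_C = Targets_util.suchThat(combos, 'PR_C % 2 == 0') # columns carry the PR_/FR_ prefixes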
import unittest
from LX2_targets import MFQL_util

class TestLX2_MS_reader(unittest.TestCase):

    def setUp(self):
        elements = {'C': (30, 300), 'H': (40, 300), 'O': (10, 10), 'N': (1, 1), 'P': (1, 1)}
        self.test = MFQL_util(elements)

    def test_makeAllCombo(self):
        pr = self.test._df.sample(n=10, random_state=1)
        fr = self.test._df.sample(n=3, random_state=2)
        df = self.test.makeAllCombo(pr, fr)
        in_df = self.test.suchThat(df, 'C_pr % 2 == 0')
        self.assertEqual(len(df), 30)
        self.assertEqual(len(in_df), 21)

if __name__ == '__main__':
    unittest.main()
\ No newline at end of file
# lextab.py. This file automatically created by PLY (version 3.11). Don't edit!
_tabversion = '3.10'
_lextokens = set(('AND', 'ARROW', 'ARROWR', 'AS', 'CHG', 'COMMA', 'DA', 'DBR', 'DEFINE', 'DIVIDE', 'DOT', 'EQUALS', 'FLOAT', 'GE', 'GT', 'ID', 'IDENTIFY', 'IFA', 'IFF', 'IN', 'INTEGER', 'IS', 'LBRACE', 'LBRACKET', 'LE', 'LPAREN', 'LT', 'MASSRANGE', 'MAXOCC', 'MINOCC', 'MINUS', 'MS1', 'MS2', 'NE', 'NEUTRALLOSS', 'NOT', 'OR', 'PERCENT', 'PLUS', 'PPM', 'QUERYNAME', 'RBRACE', 'RBRACKET', 'REPORT', 'RES', 'RPAREN', 'SEMICOLON', 'SFSTRING', 'STRING', 'SUCHTHAT', 'TIMES', 'TOLERANCE', 'WITH'))
_lexreflags = 2
_lexliterals = ''
_lexstateinfo = {'INITIAL': 'inclusive'}
_lexstatere = {'INITIAL': [('(?P<t_ID>[a-zA-Z$][a-zA-Z$0-9]*)|(?P<t_comment>[ ]*\\043[^\\n]*)|(?P<t_WS>[ \\t]+)|(?P<t_WS_NL>(([ \\t]*)\\n))|(?P<t_UNDERSCORE>_)|(?P<t_FLOAT>(\\+|-)?((\\d*\\.\\d+)(E[\\+-]?\\d+)?|([1-9]\\d*E[\\+-]?\\d+)))|(?P<t_INTEGER>(\\+|-)?\\d+)|(?P<t_STRING>\\".*?\\")|(?P<t_SFSTRING>\\\'.*?\\\')|(?P<t_IFF><=>)|(?P<t_EQUALS>==)|(?P<t_PLUS>\\+)|(?P<t_TIMES>\\*)|(?P<t_LPAREN>\\()|(?P<t_RPAREN>\\))|(?P<t_LBRACE>\\[)|(?P<t_RBRACE>\\])|(?P<t_LBRACKET>\\{)|(?P<t_RBRACKET>\\})|(?P<t_LE><=)|(?P<t_NE><>)|(?P<t_GE>>=)|(?P<t_ARROW>->)|(?P<t_ARROWR><~)|(?P<t_IFA>=>)|(?P<t_COMMA>\\,)|(?P<t_DOT>\\.)|(?P<t_IS>=)|(?P<t_MINUS>-)|(?P<t_DIVIDE>/)|(?P<t_LT><)|(?P<t_GT>>)|(?P<t_SEMICOLON>;)|(?P<t_PERCENT>%)', [None, ('t_ID', 'ID'), ('t_comment', 'comment'), ('t_WS', 'WS'), ('t_WS_NL', 'WS_NL'), None, None, ('t_UNDERSCORE', 'UNDERSCORE'), (None, 'FLOAT'), None, None, None, None, None, (None, 'INTEGER'), None, (None, 'STRING'), (None, 'SFSTRING'), (None, 'IFF'), (None, 'EQUALS'), (None, 'PLUS'), (None, 'TIMES'), (None, 'LPAREN'), (None, 'RPAREN'), (None, 'LBRACE'), (None, 'RBRACE'), (None, 'LBRACKET'), (None, 'RBRACKET'), (None, 'LE'), (None, 'NE'), (None, 'GE'), (None, 'ARROW'), (None, 'ARROWR'), (None, 'IFA'), (None, 'COMMA'), (None, 'DOT'), (None, 'IS'), (None, 'MINUS'), (None, 'DIVIDE'), (None, 'LT'), (None, 'GT'), (None, 'SEMICOLON'), (None, 'PERCENT')])]}
_lexstateignore = {'INITIAL': ''}
_lexstateerrorf = {'INITIAL': 't_error'}
_lexstateeoff = {}
......@@ -482,7 +482,7 @@ if __name__ == '__main__':
FA1C = "%d" % "((FA2.chemsc)[C])";
FA1DB = "%d" % "((FA2.chemsc)[db] - 1.5)";
FA1ERR = "%2.2f" % "(FA2.errppm)";
FA1I = FA2.intensity;rrererrrrrrrrrrre
FA1I = FA2.intensity;
FA2M = FA1.mass;
FA2C = "%d" % "((FA1.chemsc)[C])";
......
......@@ -83,74 +83,4 @@ def t_error(t):
# build lexer
lexer = lex.lex(reflags = re.I, debug = 0, optimize = 1)
if __name__ == '__main__':
mfql = '''
##########################################################
# Identify PC SPCCIES #
##########################################################
QUERYNAME = PCFAS;
DEFINE pr = 'C[30..80] H[40..300] O[10] N[1] P[1]' WITH DBR = (2.5,14.5), CHG = -1;
DEFINE FA1 = 'C[10..40] H[20..100] O[2]' WITH DBR = (1.5,7.5), CHG = -1;
DEFINE FA2 ='C[10..40] H[20..100] O[2]' WITH DBR = (1.5,7.5), CHG = -1;
IDENTIFY
pr IN MS1- AND
FA1 IN MS2- AND
FA2 IN MS2-
SUCHTHAT
isOdd(pr.chemsc[H]) AND
isOdd(pr.chemsc[db]*2) AND
FA1.chemsc + FA2.chemsc + 'C9 H19 N1 O6 P1' == pr.chemsc
REPORT
PRM = pr.mass;
EC = pr.chemsc;
CLASS = "PC" % "()";
PRC = "%d" % "((pr.chemsc)[C] - 9)";
PRDB = "%d" % "((pr.chemsc)[db] - 2.5)";
PROH = "%d" % "((pr.chemsc)[O] - 10)";
SPECIE = "PC %d:%d:%d" % "((pr.chemsc)[C]-9, pr.chemsc[db] - 2.5, pr.chemsc[O]-10)";
PRERR = "%2.2f" % "(pr.errppm)";
PRI = pr.intensity;
FA1M = FA2.mass;
FA1C = "%d" % "((FA2.chemsc)[C])";
FA1DB = "%d" % "((FA2.chemsc)[db] - 1.5)";
FA1ERR = "%2.2f" % "(FA2.errppm)";
FA1I = FA2.intensity;
FA2M = FA1.mass;
FA2C = "%d" % "((FA1.chemsc)[C])";
FA2DB = "%d" % "((FA1.chemsc)[db] - 1.5)";
FA2ERR = "%2.2f" % "(FA1.errppm)";
FA2I = FA1.intensity;
;
################ end script ##################
'''
lexer.input(mfql)
expected = '''
[LexToken(QUERYNAME,'QUERYNAME',6,160), LexToken(IS,'=',6,170), LexToken(ID,'PCFAS',6,172), LexToken(SEMICOLON,';',6,177), LexToken(DEFINE,'DEFINE',7,187), LexToken(ID,'pr',7,194), LexToken(IS,'=',7,197), LexToken(SFSTRING,"'C[30..80] H[40..300] O[10] N[1] P[1]'",7,199), LexToken(WITH,'WITH',7,238), LexToken(DBR,'DBR',7,243), LexToken(IS,'=',7,247), LexToken(LPAREN,'(',7,249), LexToken(FLOAT,'2.5',7,250), LexToken(COMMA,',',7,253), LexToken(FLOAT,'14.5',7,254), LexToken(RPAREN,')',7,258), LexToken(COMMA,',',7,259), LexToken(CHG,'CHG',7,261), LexToken(IS,'=',7,265), LexToken(INTEGER,'-1',7,267), LexToken(SEMICOLON,';',7,269), LexToken(DEFINE,'DEFINE',8,279), LexToken(ID,'FA1',8,286), LexToken(IS,'=',8,290), LexToken(SFSTRING,"'C[10..40] H[20..100] O[2]'",8,292), LexToken(WITH,'WITH',8,320), LexToken(DBR,'DBR',8,325), LexToken(IS,'=',8,329), LexToken(LPAREN,'(',8,331), LexToken(FLOAT,'1.5',8,332), LexToken(COMMA,',',8,335), LexToken(FLOAT,'7.5',8,336), LexToken(RPAREN,')',8,339), LexToken(COMMA,',',8,340), LexToken(CHG,'CHG',8,342), LexToken(IS,'=',8,346), LexToken(INTEGER,'-1',8,348), LexToken(SEMICOLON,';',8,350), LexToken(DEFINE,'DEFINE',9,360), LexToken(ID,'FA2',9,367), LexToken(IS,'=',9,371), LexToken(SFSTRING,"'C[10..40] H[20..100] O[2]'",9,372), LexToken(WITH,'WITH',9,400), LexToken(DBR,'DBR',9,405), LexToken(IS,'=',9,409), LexToken(LPAREN,'(',9,411), LexToken(FLOAT,'1.5',9,412), LexToken(COMMA,',',9,415), LexToken(FLOAT,'7.5',9,416), LexToken(RPAREN,')',9,419), LexToken(COMMA,',',9,420), LexToken(CHG,'CHG',9,422), LexToken(IS,'=',9,426), LexToken(INTEGER,'-1',9,428), LexToken(SEMICOLON,';',9,430), LexToken(IDENTIFY,'IDENTIFY',11,437), LexToken(ID,'pr',12,454), LexToken(IN,'IN',12,457), LexToken(MS1,'MS1',12,460), LexToken(MINUS,'-',12,463), LexToken(AND,'AND',12,465), LexToken(ID,'FA1',13,477), LexToken(IN,'IN',13,481), LexToken(MS2,'MS2',13,484), LexToken(MINUS,'-',13,487), LexToken(AND,'AND',13,489), LexToken(ID,'FA2',14,501), LexToken(IN,'IN',14,505), LexToken(MS2,'MS2',14,508), LexToken(MINUS,'-',14,511), LexToken(SUCHTHAT,'SUCHTHAT',16,518), LexToken(ID,'isOdd',17,535), LexToken(LPAREN,'(',17,540), LexToken(ID,'pr',17,541), LexToken(DOT,'.',17,543), LexToken(ID,'chemsc',17,544), LexToken(LBRACE,'[',17,550), LexToken(ID,'H',17,551), LexToken(RBRACE,']',17,552), LexToken(RPAREN,')',17,553), LexToken(AND,'AND',17,555), LexToken(ID,'isOdd',18,567), LexToken(LPAREN,'(',18,572), LexToken(ID,'pr',18,573), LexToken(DOT,'.',18,575), LexToken(ID,'chemsc',18,576), LexToken(LBRACE,'[',18,582), LexToken(ID,'db',18,583), LexToken(RBRACE,']',18,585), LexToken(TIMES,'*',18,586), LexToken(INTEGER,'2',18,587), LexToken(RPAREN,')',18,588), LexToken(AND,'AND',18,590), LexToken(ID,'FA1',19,602), LexToken(DOT,'.',19,605), LexToken(ID,'chemsc',19,606), LexToken(PLUS,'+',19,613), LexToken(ID,'FA2',19,615), LexToken(DOT,'.',19,618), LexToken(ID,'chemsc',19,619), LexToken(PLUS,'+',19,626), LexToken(SFSTRING,"'C9 H19 N1 O6 P1'",19,628), LexToken(EQUALS,'==',19,646), LexToken(ID,'pr',19,649), LexToken(DOT,'.',19,651), LexToken(ID,'chemsc',19,652), LexToken(REPORT,'REPORT',21,664), LexToken(ID,'PRM',22,679), LexToken(IS,'=',22,683), LexToken(ID,'pr',22,685), LexToken(DOT,'.',22,687), LexToken(ID,'mass',22,688), LexToken(SEMICOLON,';',22,692), LexToken(ID,'EC',23,702), LexToken(IS,'=',23,705), LexToken(ID,'pr',23,707), LexToken(DOT,'.',23,709), LexToken(ID,'chemsc',23,710), LexToken(SEMICOLON,';',23,716), LexToken(ID,'CLASS',24,726), LexToken(IS,'=',24,732), LexToken(STRING,'"PC"',24,734), 
LexToken(PERCENT,'%',24,739), LexToken(STRING,'"()"',24,741), LexToken(SEMICOLON,';',24,745), LexToken(ID,'PRC',26,756), LexToken(IS,'=',26,760), LexToken(STRING,'"%d"',26,762), LexToken(PERCENT,'%',26,767), LexToken(STRING,'"((pr.chemsc)[C] - 9)"',26,769), LexToken(SEMICOLON,';',26,791), LexToken(ID,'PRDB',27,801), LexToken(IS,'=',27,806), LexToken(STRING,'"%d"',27,808), LexToken(PERCENT,'%',27,813), LexToken(STRING,'"((pr.chemsc)[db] - 2.5)"',27,815), LexToken(SEMICOLON,';',27,840), LexToken(ID,'PROH',28,850), LexToken(IS,'=',28,855), LexToken(STRING,'"%d"',28,857), LexToken(PERCENT,'%',28,862), LexToken(STRING,'"((pr.chemsc)[O] - 10)"',28,864), LexToken(SEMICOLON,';',28,887), LexToken(ID,'SPECIE',29,897), LexToken(IS,'=',29,904), LexToken(STRING,'"PC %d:%d:%d"',29,906), LexToken(PERCENT,'%',29,920), LexToken(STRING,'"((pr.chemsc)[C]-9, pr.chemsc[db] - 2.5, pr.chemsc[O]-10)"',29,922), LexToken(SEMICOLON,';',29,980), LexToken(ID,'PRERR',31,995), LexToken(IS,'=',31,1001), LexToken(STRING,'"%2.2f"',31,1003), LexToken(PERCENT,'%',31,1011), LexToken(STRING,'"(pr.errppm)"',31,1013), LexToken(SEMICOLON,';',31,1026), LexToken(ID,'PRI',32,1036), LexToken(IS,'=',32,1040), LexToken(ID,'pr',32,1042), LexToken(DOT,'.',32,1044), LexToken(ID,'intensity',32,1045), LexToken(SEMICOLON,';',32,1054), LexToken(ID,'FA1M',34,1065), LexToken(IS,'=',34,1070), LexToken(ID,'FA2',34,1072), LexToken(DOT,'.',34,1075), LexToken(ID,'mass',34,1076), LexToken(SEMICOLON,';',34,1080), LexToken(ID,'FA1C',35,1091), LexToken(IS,'=',35,1096), LexToken(STRING,'"%d"',35,1098), LexToken(PERCENT,'%',35,1103), LexToken(STRING,'"((FA2.chemsc)[C])"',35,1105), LexToken(SEMICOLON,';',35,1124), LexToken(ID,'FA1DB',36,1134), LexToken(IS,'=',36,1140), LexToken(STRING,'"%d"',36,1142), LexToken(PERCENT,'%',36,1147), LexToken(STRING,'"((FA2.chemsc)[db] - 1.5)"',36,1149), LexToken(SEMICOLON,';',36,1175), LexToken(ID,'FA1ERR',37,1190), LexToken(IS,'=',37,1197), LexToken(STRING,'"%2.2f"',37,1199), LexToken(PERCENT,'%',37,1207), LexToken(STRING,'"(FA2.errppm)"',37,1209), LexToken(SEMICOLON,';',37,1223), LexToken(ID,'FA1I',38,1233), LexToken(IS,'=',38,1238), LexToken(ID,'FA2',38,1240), LexToken(DOT,'.',38,1243), LexToken(ID,'intensity',38,1244), LexToken(SEMICOLON,';',38,1253), LexToken(ID,'FA2M',40,1264), LexToken(IS,'=',40,1269), LexToken(ID,'FA1',40,1271), LexToken(DOT,'.',40,1274), LexToken(ID,'mass',40,1275), LexToken(SEMICOLON,';',40,1279), LexToken(ID,'FA2C',41,1290), LexToken(IS,'=',41,1295), LexToken(STRING,'"%d"',41,1297), LexToken(PERCENT,'%',41,1302), LexToken(STRING,'"((FA1.chemsc)[C])"',41,1304), LexToken(SEMICOLON,';',41,1323), LexToken(ID,'FA2DB',42,1333), LexToken(IS,'=',42,1339), LexToken(STRING,'"%d"',42,1341), LexToken(PERCENT,'%',42,1346), LexToken(STRING,'"((FA1.chemsc)[db] - 1.5)"',42,1348), LexToken(SEMICOLON,';',42,1374), LexToken(ID,'FA2ERR',43,1389), LexToken(IS,'=',43,1396), LexToken(STRING,'"%2.2f"',43,1398), LexToken(PERCENT,'%',43,1406), LexToken(STRING,'"(FA1.errppm)"',43,1408), LexToken(SEMICOLON,';',43,1422), LexToken(ID,'FA2I',44,1432), LexToken(IS,'=',44,1437), LexToken(ID,'FA1',44,1439), LexToken(DOT,'.',44,1442), LexToken(ID,'intensity',44,1443), LexToken(SEMICOLON,';',44,1452), LexToken(SEMICOLON,';',45,1462)]
'''
expected = expected.strip()
result = []
# Tokenize
while True:
tok = lexer.token()
if not tok:
break # No more input
result.append(tok)
assert (expected == str(result).strip())
lexer = lex.lex(reflags = re.I, debug = 0, optimize = 1)
\ No newline at end of file
......@@ -14,6 +14,7 @@ def estimate(file, bins = 50):
# as in https://stackoverflow.com/questions/42006058/binning-data-in-a-pandas-dataframe-into-intervals
# also try to use https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Interval.html
#and cummax
# bins could be the square root of the number of samples
df['m_bin'] = pd.cut(df.m, bins)
idxmax = df.groupby('m_bin')['r'].idxmin()
......@@ -25,9 +26,9 @@ def estimate(file, bins = 50):
df['m_diff'] = df.groupby('scanNum')["m"].diff()
# as in https://en.wikipedia.org/wiki/Resolution_(mass_spectrometry)
#https://en.wikipedia.org/wiki/Full_width_at_half_maximum 2.63
# df['est_r'] = df['m']/df['m_diff']
df['est_r'] = df['m'].min() / df['m_diff'] # not sure why it's one magnitude lower
# https://en.wikipedia.org/wiki/Full_width_at_half_maximum in sigmas: 2.355/3
# because the peak-to-peak spacing is about 3 sigma, so m_diff/3 = 1 sigma
df['est_r'] = df['m'].min() / (df['m_diff'] * 2.355 / 3) # test
idxmax1 = df.groupby('m_bin')['est_r'].idxmax()
# df.iloc[idxmax1.values].plot.scatter(x='m', y='est_r')
......
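A worked sketch of the comment above, under its stated assumption that the point-to-point spacing m_diff around a profile peak spans about 3 sigma of a Gaussian, so sigma = m_diff/3, FWHM = 2.355 * sigma, and the resolution is R = m/FWHM (the numbers are illustrative):

m, m_diff = 700.0, 0.003 # hypothetical spacing at m/z 700
sigma = m_diff / 3       # 0.001
fwhm = 2.355 * sigma     # 0.002355
print(round(m / fwhm))   # 297240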