Commit 1d9ad0dc authored by Eduardo Miranda's avatar Eduardo Miranda

nils file reader improvement

parent 47e72555
...@@ -249,9 +249,8 @@ def make_info_file(data_file, **kwargs): ...@@ -249,9 +249,8 @@ def make_info_file(data_file, **kwargs):
#Pickle object #Pickle object
fh = open(data_file + '.mzi', 'w') with open(data_file + '.mzi', 'w') as fh:
cPickle.dump(info_list, fh) cPickle.dump(info_list, fh)
fh.close()
else: else:
import lx.fileReader.mzAPI.mzML import lx.fileReader.mzAPI.mzML
lx.fileReader.mzAPI.mzML.make_info_file(data_file) lx.fileReader.mzAPI.mzML.make_info_file(data_file)
...@@ -352,7 +351,7 @@ class mzFile(object): ...@@ -352,7 +351,7 @@ class mzFile(object):
""" """
if data_file.lower().endswith('.lnk'): if data_file.lower().endswith('.lnk'):
data_file = follow_link(data_file) data_file = self.follow_link(data_file)
if data_file.lower().startswith('http://'): if data_file.lower().startswith('http://'):
import lx.fileReader.mzAPI.mzURL import lx.fileReader.mzAPI.mzURL
...@@ -430,16 +429,17 @@ class mzFile(object): ...@@ -430,16 +429,17 @@ class mzFile(object):
raise NotImplementedError('Subclasses must implement this method') raise NotImplementedError('Subclasses must implement this method')
def scan(self, time): def scan(self, scan_id, time):
"""Gets scan based on the specified scan time """Gets scan based on the specified scan id (id attribute in mzML) or the time
The id will be preferred over the time, and is used as a lookup key in the scan cache.
The scan is a list of (mz, intensity, resolution, baseline, The scan is a list of (mz, intensity, resolution, baseline,
noise, charge) tuples. Actually only recent versions of the raw noise, charge) tuples. Actually only recent versions of the raw
file format returns all of those. Normally only mz and file format returns all of those. Normally only mz and
intensity are filled, the others set to zero. intensity are filled, the others set to zero.
Example: Example:
>>> scan = myPeakFile.scan(20.035) >>> scan = myPeakFile.scan("controllerType=0 controllerNumber=1 scan=3161", 23.21)
""" """
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#MRM_Q3MS_TEXT As String = "Q3MS " #MRM_Q3MS_TEXT As String = "Q3MS "
#MRM_SRM_TEXT As String = "SRM ms2" #MRM_SRM_TEXT As String = "SRM ms2"
#MRM_FullNL_TEXT As String = "Full cnl " ' MRM neutral loss #MRM_FullNL_TEXT As String = "Full cnl " ' MRM neutral loss
import mmap
from lx.exceptions import LipidXException from lx.exceptions import LipidXException
...@@ -130,9 +131,8 @@ def make_info_file(data_file): ...@@ -130,9 +131,8 @@ def make_info_file(data_file):
m = mzFile(data_file) m = mzFile(data_file)
m._build_info_scans() m._build_info_scans()
fh = open(data_file + '.mzi', 'wb') with open(data_file + '.mzi', 'wb') as fh:
cPickle.dump(m._info_scans, fh) cPickle.dump(m._info_scans, fh)
fh.close()
class mzFile(mzAPImzFile): class mzFile(mzAPImzFile):
...@@ -147,7 +147,6 @@ class mzFile(mzAPImzFile): ...@@ -147,7 +147,6 @@ class mzFile(mzAPImzFile):
use, there's the _build_info_scans method, but in general it makes use, there's the _build_info_scans method, but in general it makes
more sense to create an .mzi file. more sense to create an .mzi file.
""" """
_xp_time = etree.XPath(('./mz:scanList/mz:scan/' _xp_time = etree.XPath(('./mz:scanList/mz:scan/'
'mz:cvParam[@name="scan start time"]/' 'mz:cvParam[@name="scan start time"]/'
'attribute::value'), 'attribute::value'),
...@@ -178,6 +177,7 @@ class mzFile(mzAPImzFile): ...@@ -178,6 +177,7 @@ class mzFile(mzAPImzFile):
namespaces=NSd, smart_strings=False) namespaces=NSd, smart_strings=False)
_xp_tic = etree.XPath('./mz:cvParam[@name="total ion current"]/attribute::value', _xp_tic = etree.XPath('./mz:cvParam[@name="total ion current"]/attribute::value',
namespaces=NSd, smart_strings=False) namespaces=NSd, smart_strings=False)
scan_map = {}
def __init__(self, data_file, **kwargs): def __init__(self, data_file, **kwargs):
self.file_type = 'mzml' self.file_type = 'mzml'
...@@ -186,18 +186,25 @@ class mzFile(mzAPImzFile): ...@@ -186,18 +186,25 @@ class mzFile(mzAPImzFile):
if data_file.lower().endswith('.mzml.gz'): if data_file.lower().endswith('.mzml.gz'):
self.fileobj = gzip.GzipFile(data_file, mode='rb') self.fileobj = gzip.GzipFile(data_file, mode='rb')
else: else:
self.fileobj = open(data_file, mode='rb') with open(data_file, "r+b") as f:
self.mmap_fileobj = f
self.fileobj = mmap.mmap(f.fileno(), 0)
if os.path.exists(data_file + '.mzi'): if os.path.exists(data_file + '.mzi'):
self._info_file = data_file + '.mzi' self._info_file = data_file + '.mzi'
info_fh = open(self._info_file) with open(self._info_file) as info_fh:
self._info_scans = cPickle.load(info_fh) self._info_scans = cPickle.load(info_fh)
info_fh.close()
else: else:
self._info_file = None self._info_file = None
self._info_scans = None
def close(self): def close(self):
self.fileobj.close() self.fileobj.close()
if self.mmap_fileobj is not None:
self.mmap_fileobj.close()
self._info_file = None
self._info_scans = None
self.scan_map = {}
def scan_list(self, start_time=None, stop_time=None, start_mz=0, stop_mz=99999): def scan_list(self, start_time=None, stop_time=None, start_mz=0, stop_mz=99999):
if start_time is None: if start_time is None:
...@@ -207,7 +214,7 @@ class mzFile(mzAPImzFile): ...@@ -207,7 +214,7 @@ class mzFile(mzAPImzFile):
if self._info_file: if self._info_file:
return [ (i['time'], i['mz']) for i in self._info_scans return [ (i['time'], i['mz']) for i in self._info_scans
if (start_time <= i['time'] and ((not stop_time) or i['time'] <= stop_time)) if (start_time <= i['time'] and ((not stop_time) or i['time'] <= stop_time))
and (i['scan_mode'] == 'MS1' or start_mz <= i['mz'] <= stop_mz) ] and (i['scan_type'] == 'MS1' or start_mz <= i['mz'] <= stop_mz) ]
scan_list = [] scan_list = []
...@@ -263,6 +270,10 @@ class mzFile(mzAPImzFile): ...@@ -263,6 +270,10 @@ class mzFile(mzAPImzFile):
self.fileobj.seek(0) self.fileobj.seek(0)
context = etree.iterparse(self.fileobj, events=('end',), context = etree.iterparse(self.fileobj, events=('end',),
tag='%sspectrum' % NS) tag='%sspectrum' % NS)
n_ms1 = 0
n_ms2 = 0
n_ms1_filtered = 0
n_ms2_filtered = 0
for event, elem in context: for event, elem in context:
xt = self._xp_time(elem) xt = self._xp_time(elem)
if xt: if xt:
...@@ -311,7 +322,7 @@ class mzFile(mzAPImzFile): ...@@ -311,7 +322,7 @@ class mzFile(mzAPImzFile):
elif self._xp_frg_pis(elem): elif self._xp_frg_pis(elem):
p = float(self._xp_frg_pis(elem)[0]) p = float(self._xp_frg_pis(elem)[0])
else: else:
print "this ms2 didn't have a precursor m/z or a fragment scan m/z...", elem.get("id") print "Skipping scan: '%s' ; Could not find a precursor or fragment!" % (elem.get('id'))
else: else:
p = precursor p = precursor
...@@ -325,6 +336,20 @@ class mzFile(mzAPImzFile): ...@@ -325,6 +336,20 @@ class mzFile(mzAPImzFile):
scan_name = elem.get('id') scan_name = elem.get('id')
scan_info.append((time, mz, scan_name, 'MS2', scan_mode, polarity, total_ic, 0, 0)) scan_info.append((time, mz, scan_name, 'MS2', scan_mode, polarity, total_ic, 0, 0))
# caching here changes the results :-/
# load mz and intensity arrays proactively
mz, it = zip(*self._scan_from_spec_node(elem, xt))
empty = [0 for i in range(len(mz))]
self.scan_map[scan_name] = zip(list(mz), list(it), empty, empty, empty, empty)
n_ms2 = n_ms2 + 1
else:
#print "Skipping scan: '%s' ; m/z='%f' is out of defined range!"%(elem.get('id'), mz)
n_ms2_filtered = n_ms2_filtered + 1
else:
#print "Skipping scan: '%s' ; Could not find a precursor!"%(elem.get('id'))
n_ms2_filtered = n_ms2_filtered + 1
else: else:
if self._xp_prof(elem): if self._xp_prof(elem):
scan_mode = 'p' scan_mode = 'p'
...@@ -333,10 +358,23 @@ class mzFile(mzAPImzFile): ...@@ -333,10 +358,23 @@ class mzFile(mzAPImzFile):
scan_name = elem.get('id') scan_name = elem.get('id')
scan_info.append((time, 0.0, scan_name, 'MS1', scan_mode, polarity, total_ic, 0, 0)) scan_info.append((time, 0.0, scan_name, 'MS1', scan_mode, polarity, total_ic, 0, 0))
# caching here changes the results :-/
# load mz and intensity arrays proactively
mz, it = zip(*self._scan_from_spec_node(elem, xt))
empty = [0 for i in range(len(mz))]
self.scan_map[scan_name] = zip(list(mz), list(it), empty, empty, empty, empty)
n_ms1 = n_ms1 + 1
else:
n_ms1_filtered = n_ms1_filtered + 1
else: else:
print "this spectrum didn't have a scan time...", elem.get("id") print "this spectrum didn't have a scan time...", elem.get("id")
n_ms1_filtered = n_ms1_filtered + 1
elem.clear() elem.clear()
#total_scans = n_ms1+n_ms1_filtered+n_ms2+n_ms2_filtered
#print "Loaded %d of %d scan info objects, %d MS1 scans, %d MS2 scans, filtered %d MS1 scans and %d MS2 scans."%(len(scan_info), total_scans, n_ms1, n_ms2, n_ms1_filtered, n_ms2_filtered)
return scan_info return scan_info
def scan_time_from_scan_name(self, scan_name): def scan_time_from_scan_name(self, scan_name):
...@@ -361,10 +399,13 @@ class mzFile(mzAPImzFile): ...@@ -361,10 +399,13 @@ class mzFile(mzAPImzFile):
else: else:
print "scan not found:", scan_name print "scan not found:", scan_name
def scan(self, time): def scan(self, scan_id, time):
# this implementation takes a few seconds and uses very little memory # this implementation takes a few seconds and uses very little memory
# (by keeping only the current closest scan) # (by keeping only the current closest scan)
if self.scan_map.has_key(scan_id):
return self.scan_map[scan_id]
if self._info_file: if self._info_file:
closest_item = self._info_scans.closest(key='time', value=time) closest_item = self._info_scans.closest(key='time', value=time)
spec_start, spec_size = closest_item['offset'], closest_item['size'] spec_start, spec_size = closest_item['offset'], closest_item['size']
...@@ -374,7 +415,6 @@ class mzFile(mzAPImzFile): ...@@ -374,7 +415,6 @@ class mzFile(mzAPImzFile):
mz, it = zip(*self._scan_from_spec_node(spec, closest_item['time'], prefix=False)) mz, it = zip(*self._scan_from_spec_node(spec, closest_item['time'], prefix=False))
empty = [0 for i in range(len(mz))] empty = [0 for i in range(len(mz))]
return zip(list(mz), list(it), empty, empty, empty, empty) return zip(list(mz), list(it), empty, empty, empty, empty)
#return self._scan_from_spec_node(spec, closest_item['time'], prefix=False)
self.fileobj.seek(0) self.fileobj.seek(0)
context = etree.iterparse(self.fileobj, events=('end',), context = etree.iterparse(self.fileobj, events=('end',),
...@@ -600,7 +640,7 @@ class mzFileInMemory: ...@@ -600,7 +640,7 @@ class mzFileInMemory:
# now there's nothing to do, because there's no handle on the mzML file # now there's nothing to do, because there's no handle on the mzML file
#pass #pass
def scan_list(self, start_timeNone, stop_time=None, start_mz=0, stop_mz=99999): def scan_list(self, start_time=None, stop_time=None, start_mz=0, stop_mz=99999):
if start_time is None or stop_time is None: if start_time is None or stop_time is None:
(file_start_time, file_stop_time) = self.time_range() (file_start_time, file_stop_time) = self.time_range()
if start_time is None: if start_time is None:
......
...@@ -113,7 +113,7 @@ def add_Sample( ...@@ -113,7 +113,7 @@ def add_Sample(
else: else:
polarity = pol polarity = pol
scan = mz_file.scan(t) scan = mz_file.scan(scan_id=sn, time=t)
# don't consider empty scans # don't consider empty scans
if len(scan) == 0: if len(scan) == 0:
...@@ -159,7 +159,7 @@ def add_Sample( ...@@ -159,7 +159,7 @@ def add_Sample(
else: else:
polarity = pol polarity = pol
scan = mz_file.scan(t) scan = mz_file.scan(scan_id=sn, time=t)
# don't consider empty scans # don't consider empty scans
if len(scan) == 0: if len(scan) == 0:
...@@ -571,7 +571,7 @@ def add_Sample_AVG( ...@@ -571,7 +571,7 @@ def add_Sample_AVG(
else: else:
polarity = pol polarity = pol
scan = mz_file.scan(t) scan = mz_file.scan(scan_id=sn, time=t)
# don't consider empty scans # don't consider empty scans
if len(scan) == 0: if len(scan) == 0:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment