Commit 1d9ad0dc authored by Eduardo Miranda's avatar Eduardo Miranda

nils file reader improvement

parent 47e72555
......@@ -249,9 +249,8 @@ def make_info_file(data_file, **kwargs):
#Pickle object
fh = open(data_file + '.mzi', 'w')
cPickle.dump(info_list, fh)
fh.close()
with open(data_file + '.mzi', 'w') as fh:
cPickle.dump(info_list, fh)
else:
import lx.fileReader.mzAPI.mzML
lx.fileReader.mzAPI.mzML.make_info_file(data_file)
......@@ -352,7 +351,7 @@ class mzFile(object):
"""
if data_file.lower().endswith('.lnk'):
data_file = follow_link(data_file)
data_file = self.follow_link(data_file)
if data_file.lower().startswith('http://'):
import lx.fileReader.mzAPI.mzURL
......@@ -430,16 +429,17 @@ class mzFile(object):
raise NotImplementedError('Subclasses must implement this method')
def scan(self, time):
"""Gets scan based on the specified scan time
def scan(self, scan_id, time):
"""Gets scan based on the specified scan id (id attribute in mzML) or the time
The id will be preferred over the time, and is used as a lookup key in the scan cache.
The scan is a list of (mz, intensity, resolution, baseline,
noise, charge) tuples. Actually only recent versions of the raw
file format returns all of those. Normally only mz and
intensity are filled, the others set to zero.
Example:
>>> scan = myPeakFile.scan(20.035)
>>> scan = myPeakFile.scan("controllerType=0 controllerNumber=1 scan=3161", 23.21)
"""
......
......@@ -24,6 +24,7 @@
#MRM_Q3MS_TEXT As String = "Q3MS "
#MRM_SRM_TEXT As String = "SRM ms2"
#MRM_FullNL_TEXT As String = "Full cnl " ' MRM neutral loss
import mmap
from lx.exceptions import LipidXException
......@@ -130,9 +131,8 @@ def make_info_file(data_file):
m = mzFile(data_file)
m._build_info_scans()
fh = open(data_file + '.mzi', 'wb')
cPickle.dump(m._info_scans, fh)
fh.close()
with open(data_file + '.mzi', 'wb') as fh:
cPickle.dump(m._info_scans, fh)
class mzFile(mzAPImzFile):
......@@ -147,7 +147,6 @@ class mzFile(mzAPImzFile):
use, there's the _build_info_scans method, but in general it makes
more sense to create an .mzi file.
"""
_xp_time = etree.XPath(('./mz:scanList/mz:scan/'
'mz:cvParam[@name="scan start time"]/'
'attribute::value'),
......@@ -178,6 +177,7 @@ class mzFile(mzAPImzFile):
namespaces=NSd, smart_strings=False)
_xp_tic = etree.XPath('./mz:cvParam[@name="total ion current"]/attribute::value',
namespaces=NSd, smart_strings=False)
scan_map = {}
def __init__(self, data_file, **kwargs):
self.file_type = 'mzml'
......@@ -186,18 +186,25 @@ class mzFile(mzAPImzFile):
if data_file.lower().endswith('.mzml.gz'):
self.fileobj = gzip.GzipFile(data_file, mode='rb')
else:
self.fileobj = open(data_file, mode='rb')
with open(data_file, "r+b") as f:
self.mmap_fileobj = f
self.fileobj = mmap.mmap(f.fileno(), 0)
if os.path.exists(data_file + '.mzi'):
self._info_file = data_file + '.mzi'
info_fh = open(self._info_file)
self._info_scans = cPickle.load(info_fh)
info_fh.close()
with open(self._info_file) as info_fh:
self._info_scans = cPickle.load(info_fh)
else:
self._info_file = None
self._info_scans = None
def close(self):
self.fileobj.close()
if self.mmap_fileobj is not None:
self.mmap_fileobj.close()
self._info_file = None
self._info_scans = None
self.scan_map = {}
def scan_list(self, start_time=None, stop_time=None, start_mz=0, stop_mz=99999):
if start_time is None:
......@@ -207,7 +214,7 @@ class mzFile(mzAPImzFile):
if self._info_file:
return [ (i['time'], i['mz']) for i in self._info_scans
if (start_time <= i['time'] and ((not stop_time) or i['time'] <= stop_time))
and (i['scan_mode'] == 'MS1' or start_mz <= i['mz'] <= stop_mz) ]
and (i['scan_type'] == 'MS1' or start_mz <= i['mz'] <= stop_mz) ]
scan_list = []
......@@ -263,6 +270,10 @@ class mzFile(mzAPImzFile):
self.fileobj.seek(0)
context = etree.iterparse(self.fileobj, events=('end',),
tag='%sspectrum' % NS)
n_ms1 = 0
n_ms2 = 0
n_ms1_filtered = 0
n_ms2_filtered = 0
for event, elem in context:
xt = self._xp_time(elem)
if xt:
......@@ -311,7 +322,7 @@ class mzFile(mzAPImzFile):
elif self._xp_frg_pis(elem):
p = float(self._xp_frg_pis(elem)[0])
else:
print "this ms2 didn't have a precursor m/z or a fragment scan m/z...", elem.get("id")
print "Skipping scan: '%s' ; Could not find a precursor or fragment!" % (elem.get('id'))
else:
p = precursor
......@@ -325,6 +336,20 @@ class mzFile(mzAPImzFile):
scan_name = elem.get('id')
scan_info.append((time, mz, scan_name, 'MS2', scan_mode, polarity, total_ic, 0, 0))
# caching here changes the results :-/
# load mz and intensity arrays proactively
mz, it = zip(*self._scan_from_spec_node(elem, xt))
empty = [0 for i in range(len(mz))]
self.scan_map[scan_name] = zip(list(mz), list(it), empty, empty, empty, empty)
n_ms2 = n_ms2 + 1
else:
#print "Skipping scan: '%s' ; m/z='%f' is out of defined range!"%(elem.get('id'), mz)
n_ms2_filtered = n_ms2_filtered + 1
else:
#print "Skipping scan: '%s' ; Could not find a precursor!"%(elem.get('id'))
n_ms2_filtered = n_ms2_filtered + 1
else:
if self._xp_prof(elem):
scan_mode = 'p'
......@@ -333,10 +358,23 @@ class mzFile(mzAPImzFile):
scan_name = elem.get('id')
scan_info.append((time, 0.0, scan_name, 'MS1', scan_mode, polarity, total_ic, 0, 0))
# caching here changes the results :-/
# load mz and intensity arrays proactively
mz, it = zip(*self._scan_from_spec_node(elem, xt))
empty = [0 for i in range(len(mz))]
self.scan_map[scan_name] = zip(list(mz), list(it), empty, empty, empty, empty)
n_ms1 = n_ms1 + 1
else:
n_ms1_filtered = n_ms1_filtered + 1
else:
print "this spectrum didn't have a scan time...", elem.get("id")
n_ms1_filtered = n_ms1_filtered + 1
elem.clear()
#total_scans = n_ms1+n_ms1_filtered+n_ms2+n_ms2_filtered
#print "Loaded %d of %d scan info objects, %d MS1 scans, %d MS2 scans, filtered %d MS1 scans and %d MS2 scans."%(len(scan_info), total_scans, n_ms1, n_ms2, n_ms1_filtered, n_ms2_filtered)
return scan_info
def scan_time_from_scan_name(self, scan_name):
......@@ -361,10 +399,13 @@ class mzFile(mzAPImzFile):
else:
print "scan not found:", scan_name
def scan(self, time):
def scan(self, scan_id, time):
# this implementation takes a few seconds and uses very little memory
# (by keeping only the current closest scan)
if self.scan_map.has_key(scan_id):
return self.scan_map[scan_id]
if self._info_file:
closest_item = self._info_scans.closest(key='time', value=time)
spec_start, spec_size = closest_item['offset'], closest_item['size']
......@@ -374,7 +415,6 @@ class mzFile(mzAPImzFile):
mz, it = zip(*self._scan_from_spec_node(spec, closest_item['time'], prefix=False))
empty = [0 for i in range(len(mz))]
return zip(list(mz), list(it), empty, empty, empty, empty)
#return self._scan_from_spec_node(spec, closest_item['time'], prefix=False)
self.fileobj.seek(0)
context = etree.iterparse(self.fileobj, events=('end',),
......@@ -600,7 +640,7 @@ class mzFileInMemory:
# now there's nothing to do, because there's no handle on the mzML file
#pass
def scan_list(self, start_timeNone, stop_time=None, start_mz=0, stop_mz=99999):
def scan_list(self, start_time=None, stop_time=None, start_mz=0, stop_mz=99999):
if start_time is None or stop_time is None:
(file_start_time, file_stop_time) = self.time_range()
if start_time is None:
......
......@@ -113,7 +113,7 @@ def add_Sample(
else:
polarity = pol
scan = mz_file.scan(t)
scan = mz_file.scan(scan_id=sn, time=t)
# don't consider empty scans
if len(scan) == 0:
......@@ -159,7 +159,7 @@ def add_Sample(
else:
polarity = pol
scan = mz_file.scan(t)
scan = mz_file.scan(scan_id=sn, time=t)
# don't consider empty scans
if len(scan) == 0:
......@@ -571,7 +571,7 @@ def add_Sample_AVG(
else:
polarity = pol
scan = mz_file.scan(t)
scan = mz_file.scan(scan_id=sn, time=t)
# don't consider empty scans
if len(scan) == 0:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment