From 1d9ad0dcaff1dc2f0d3878a43d4a78c07fd0d0b7 Mon Sep 17 00:00:00 2001 From: mirandaa Date: Wed, 25 Sep 2019 15:04:12 +0200 Subject: [PATCH] nils file reader improvement --- lx/fileReader/mzAPI/__init__.py | 14 +++---- lx/fileReader/mzAPI/mzML.py | 66 ++++++++++++++++++++++++++------- lx/readSpectra.py | 6 +-- 3 files changed, 63 insertions(+), 23 deletions(-) diff --git a/lx/fileReader/mzAPI/__init__.py b/lx/fileReader/mzAPI/__init__.py index 06d0ed7..de3a47a 100644 --- a/lx/fileReader/mzAPI/__init__.py +++ b/lx/fileReader/mzAPI/__init__.py @@ -249,9 +249,8 @@ def make_info_file(data_file, **kwargs): #Pickle object - fh = open(data_file + '.mzi', 'w') - cPickle.dump(info_list, fh) - fh.close() + with open(data_file + '.mzi', 'w') as fh: + cPickle.dump(info_list, fh) else: import lx.fileReader.mzAPI.mzML lx.fileReader.mzAPI.mzML.make_info_file(data_file) @@ -352,7 +351,7 @@ class mzFile(object): """ if data_file.lower().endswith('.lnk'): - data_file = follow_link(data_file) + data_file = self.follow_link(data_file) if data_file.lower().startswith('http://'): import lx.fileReader.mzAPI.mzURL @@ -430,16 +429,17 @@ class mzFile(object): raise NotImplementedError('Subclasses must implement this method') - def scan(self, time): - """Gets scan based on the specified scan time + def scan(self, scan_id, time): + """Gets scan based on the specified scan id (id attribute in mzML) or the time + The id will be preferred over the time, and is used as a lookup key in the scan cache. The scan is a list of (mz, intensity, resolution, baseline, noise, charge) tuples. Actually only recent versions of the raw file format returns all of those. Normally only mz and intensity are filled, the others set to zero. Example: - >>> scan = myPeakFile.scan(20.035) + >>> scan = myPeakFile.scan("controllerType=0 controllerNumber=1 scan=3161", 23.21) """ diff --git a/lx/fileReader/mzAPI/mzML.py b/lx/fileReader/mzAPI/mzML.py index 1db7cc3..238adb5 100644 --- a/lx/fileReader/mzAPI/mzML.py +++ b/lx/fileReader/mzAPI/mzML.py @@ -24,6 +24,7 @@ #MRM_Q3MS_TEXT As String = "Q3MS " #MRM_SRM_TEXT As String = "SRM ms2" #MRM_FullNL_TEXT As String = "Full cnl " ' MRM neutral loss +import mmap from lx.exceptions import LipidXException @@ -130,9 +131,8 @@ def make_info_file(data_file): m = mzFile(data_file) m._build_info_scans() - fh = open(data_file + '.mzi', 'wb') - cPickle.dump(m._info_scans, fh) - fh.close() + with open(data_file + '.mzi', 'wb') as fh: + cPickle.dump(m._info_scans, fh) class mzFile(mzAPImzFile): @@ -147,7 +147,6 @@ class mzFile(mzAPImzFile): use, there's the _build_info_scans method, but in general it makes more sense to create an .mzi file. """ - _xp_time = etree.XPath(('./mz:scanList/mz:scan/' 'mz:cvParam[@name="scan start time"]/' 'attribute::value'), @@ -178,6 +177,7 @@ class mzFile(mzAPImzFile): namespaces=NSd, smart_strings=False) _xp_tic = etree.XPath('./mz:cvParam[@name="total ion current"]/attribute::value', namespaces=NSd, smart_strings=False) + scan_map = {} def __init__(self, data_file, **kwargs): self.file_type = 'mzml' @@ -186,18 +186,25 @@ class mzFile(mzAPImzFile): if data_file.lower().endswith('.mzml.gz'): self.fileobj = gzip.GzipFile(data_file, mode='rb') else: - self.fileobj = open(data_file, mode='rb') + with open(data_file, "r+b") as f: + self.mmap_fileobj = f + self.fileobj = mmap.mmap(f.fileno(), 0) if os.path.exists(data_file + '.mzi'): self._info_file = data_file + '.mzi' - info_fh = open(self._info_file) - self._info_scans = cPickle.load(info_fh) - info_fh.close() + with open(self._info_file) as info_fh: + self._info_scans = cPickle.load(info_fh) else: self._info_file = None + self._info_scans = None def close(self): self.fileobj.close() + if self.mmap_fileobj is not None: + self.mmap_fileobj.close() + self._info_file = None + self._info_scans = None + self.scan_map = {} def scan_list(self, start_time=None, stop_time=None, start_mz=0, stop_mz=99999): if start_time is None: @@ -207,7 +214,7 @@ class mzFile(mzAPImzFile): if self._info_file: return [ (i['time'], i['mz']) for i in self._info_scans if (start_time <= i['time'] and ((not stop_time) or i['time'] <= stop_time)) - and (i['scan_mode'] == 'MS1' or start_mz <= i['mz'] <= stop_mz) ] + and (i['scan_type'] == 'MS1' or start_mz <= i['mz'] <= stop_mz) ] scan_list = [] @@ -263,6 +270,10 @@ class mzFile(mzAPImzFile): self.fileobj.seek(0) context = etree.iterparse(self.fileobj, events=('end',), tag='%sspectrum' % NS) + n_ms1 = 0 + n_ms2 = 0 + n_ms1_filtered = 0 + n_ms2_filtered = 0 for event, elem in context: xt = self._xp_time(elem) if xt: @@ -311,7 +322,7 @@ class mzFile(mzAPImzFile): elif self._xp_frg_pis(elem): p = float(self._xp_frg_pis(elem)[0]) else: - print "this ms2 didn't have a precursor m/z or a fragment scan m/z...", elem.get("id") + print "Skipping scan: '%s' ; Could not find a precursor or fragment!" % (elem.get('id')) else: p = precursor @@ -325,6 +336,20 @@ class mzFile(mzAPImzFile): scan_name = elem.get('id') scan_info.append((time, mz, scan_name, 'MS2', scan_mode, polarity, total_ic, 0, 0)) + + # caching here changes the results :-/ + # load mz and intensity arrays proactively + mz, it = zip(*self._scan_from_spec_node(elem, xt)) + empty = [0 for i in range(len(mz))] + self.scan_map[scan_name] = zip(list(mz), list(it), empty, empty, empty, empty) + + n_ms2 = n_ms2 + 1 + else: + #print "Skipping scan: '%s' ; m/z='%f' is out of defined range!"%(elem.get('id'), mz) + n_ms2_filtered = n_ms2_filtered + 1 + else: + #print "Skipping scan: '%s' ; Could not find a precursor!"%(elem.get('id')) + n_ms2_filtered = n_ms2_filtered + 1 else: if self._xp_prof(elem): scan_mode = 'p' @@ -333,10 +358,23 @@ class mzFile(mzAPImzFile): scan_name = elem.get('id') scan_info.append((time, 0.0, scan_name, 'MS1', scan_mode, polarity, total_ic, 0, 0)) + + # caching here changes the results :-/ + # load mz and intensity arrays proactively + mz, it = zip(*self._scan_from_spec_node(elem, xt)) + empty = [0 for i in range(len(mz))] + self.scan_map[scan_name] = zip(list(mz), list(it), empty, empty, empty, empty) + + n_ms1 = n_ms1 + 1 + else: + n_ms1_filtered = n_ms1_filtered + 1 else: print "this spectrum didn't have a scan time...", elem.get("id") + n_ms1_filtered = n_ms1_filtered + 1 elem.clear() + #total_scans = n_ms1+n_ms1_filtered+n_ms2+n_ms2_filtered + #print "Loaded %d of %d scan info objects, %d MS1 scans, %d MS2 scans, filtered %d MS1 scans and %d MS2 scans."%(len(scan_info), total_scans, n_ms1, n_ms2, n_ms1_filtered, n_ms2_filtered) return scan_info def scan_time_from_scan_name(self, scan_name): @@ -361,10 +399,13 @@ class mzFile(mzAPImzFile): else: print "scan not found:", scan_name - def scan(self, time): + def scan(self, scan_id, time): # this implementation takes a few seconds and uses very little memory # (by keeping only the current closest scan) + if self.scan_map.has_key(scan_id): + return self.scan_map[scan_id] + if self._info_file: closest_item = self._info_scans.closest(key='time', value=time) spec_start, spec_size = closest_item['offset'], closest_item['size'] @@ -374,7 +415,6 @@ class mzFile(mzAPImzFile): mz, it = zip(*self._scan_from_spec_node(spec, closest_item['time'], prefix=False)) empty = [0 for i in range(len(mz))] return zip(list(mz), list(it), empty, empty, empty, empty) - #return self._scan_from_spec_node(spec, closest_item['time'], prefix=False) self.fileobj.seek(0) context = etree.iterparse(self.fileobj, events=('end',), @@ -600,7 +640,7 @@ class mzFileInMemory: # now there's nothing to do, because there's no handle on the mzML file #pass - def scan_list(self, start_timeNone, stop_time=None, start_mz=0, stop_mz=99999): + def scan_list(self, start_time=None, stop_time=None, start_mz=0, stop_mz=99999): if start_time is None or stop_time is None: (file_start_time, file_stop_time) = self.time_range() if start_time is None: diff --git a/lx/readSpectra.py b/lx/readSpectra.py index 9db433d..30ac26f 100644 --- a/lx/readSpectra.py +++ b/lx/readSpectra.py @@ -113,7 +113,7 @@ def add_Sample( else: polarity = pol - scan = mz_file.scan(t) + scan = mz_file.scan(scan_id=sn, time=t) # don't consider empty scans if len(scan) == 0: @@ -159,7 +159,7 @@ def add_Sample( else: polarity = pol - scan = mz_file.scan(t) + scan = mz_file.scan(scan_id=sn, time=t) # don't consider empty scans if len(scan) == 0: @@ -571,7 +571,7 @@ def add_Sample_AVG( else: polarity = pol - scan = mz_file.scan(t) + scan = mz_file.scan(scan_id=sn, time=t) # don't consider empty scans if len(scan) == 0: -- GitLab