diff --git a/lx/fileReader/mzAPI/__init__.py b/lx/fileReader/mzAPI/__init__.py index 06d0ed788fa6ad07e4b54308a5d1ef58fc74c4da..de3a47a23881611febb38dc0ac4f3d3703dee3b3 100644 --- a/lx/fileReader/mzAPI/__init__.py +++ b/lx/fileReader/mzAPI/__init__.py @@ -249,9 +249,8 @@ def make_info_file(data_file, **kwargs): #Pickle object - fh = open(data_file + '.mzi', 'w') - cPickle.dump(info_list, fh) - fh.close() + with open(data_file + '.mzi', 'w') as fh: + cPickle.dump(info_list, fh) else: import lx.fileReader.mzAPI.mzML lx.fileReader.mzAPI.mzML.make_info_file(data_file) @@ -352,7 +351,7 @@ class mzFile(object): """ if data_file.lower().endswith('.lnk'): - data_file = follow_link(data_file) + data_file = self.follow_link(data_file) if data_file.lower().startswith('http://'): import lx.fileReader.mzAPI.mzURL @@ -430,16 +429,17 @@ class mzFile(object): raise NotImplementedError('Subclasses must implement this method') - def scan(self, time): - """Gets scan based on the specified scan time + def scan(self, scan_id, time): + """Gets scan based on the specified scan id (id attribute in mzML) or the time + The id will be preferred over the time, and is used as a lookup key in the scan cache. The scan is a list of (mz, intensity, resolution, baseline, noise, charge) tuples. Actually only recent versions of the raw file format returns all of those. Normally only mz and intensity are filled, the others set to zero. Example: - >>> scan = myPeakFile.scan(20.035) + >>> scan = myPeakFile.scan("controllerType=0 controllerNumber=1 scan=3161", 23.21) """ diff --git a/lx/fileReader/mzAPI/mzML.py b/lx/fileReader/mzAPI/mzML.py index 1db7cc3515f3a8c10c954623545ad5619dc0d7d0..238adb52bc0f625d0a668ddeb00ecda9c7343b33 100644 --- a/lx/fileReader/mzAPI/mzML.py +++ b/lx/fileReader/mzAPI/mzML.py @@ -24,6 +24,7 @@ #MRM_Q3MS_TEXT As String = "Q3MS " #MRM_SRM_TEXT As String = "SRM ms2" #MRM_FullNL_TEXT As String = "Full cnl " ' MRM neutral loss +import mmap from lx.exceptions import LipidXException @@ -130,9 +131,8 @@ def make_info_file(data_file): m = mzFile(data_file) m._build_info_scans() - fh = open(data_file + '.mzi', 'wb') - cPickle.dump(m._info_scans, fh) - fh.close() + with open(data_file + '.mzi', 'wb') as fh: + cPickle.dump(m._info_scans, fh) class mzFile(mzAPImzFile): @@ -147,7 +147,6 @@ class mzFile(mzAPImzFile): use, there's the _build_info_scans method, but in general it makes more sense to create an .mzi file. """ - _xp_time = etree.XPath(('./mz:scanList/mz:scan/' 'mz:cvParam[@name="scan start time"]/' 'attribute::value'), @@ -178,6 +177,7 @@ class mzFile(mzAPImzFile): namespaces=NSd, smart_strings=False) _xp_tic = etree.XPath('./mz:cvParam[@name="total ion current"]/attribute::value', namespaces=NSd, smart_strings=False) + scan_map = {} def __init__(self, data_file, **kwargs): self.file_type = 'mzml' @@ -186,18 +186,25 @@ class mzFile(mzAPImzFile): if data_file.lower().endswith('.mzml.gz'): self.fileobj = gzip.GzipFile(data_file, mode='rb') else: - self.fileobj = open(data_file, mode='rb') + with open(data_file, "r+b") as f: + self.mmap_fileobj = f + self.fileobj = mmap.mmap(f.fileno(), 0) if os.path.exists(data_file + '.mzi'): self._info_file = data_file + '.mzi' - info_fh = open(self._info_file) - self._info_scans = cPickle.load(info_fh) - info_fh.close() + with open(self._info_file) as info_fh: + self._info_scans = cPickle.load(info_fh) else: self._info_file = None + self._info_scans = None def close(self): self.fileobj.close() + if self.mmap_fileobj is not None: + self.mmap_fileobj.close() + self._info_file = None + self._info_scans = None + self.scan_map = {} def scan_list(self, start_time=None, stop_time=None, start_mz=0, stop_mz=99999): if start_time is None: @@ -207,7 +214,7 @@ class mzFile(mzAPImzFile): if self._info_file: return [ (i['time'], i['mz']) for i in self._info_scans if (start_time <= i['time'] and ((not stop_time) or i['time'] <= stop_time)) - and (i['scan_mode'] == 'MS1' or start_mz <= i['mz'] <= stop_mz) ] + and (i['scan_type'] == 'MS1' or start_mz <= i['mz'] <= stop_mz) ] scan_list = [] @@ -263,6 +270,10 @@ class mzFile(mzAPImzFile): self.fileobj.seek(0) context = etree.iterparse(self.fileobj, events=('end',), tag='%sspectrum' % NS) + n_ms1 = 0 + n_ms2 = 0 + n_ms1_filtered = 0 + n_ms2_filtered = 0 for event, elem in context: xt = self._xp_time(elem) if xt: @@ -311,7 +322,7 @@ class mzFile(mzAPImzFile): elif self._xp_frg_pis(elem): p = float(self._xp_frg_pis(elem)[0]) else: - print "this ms2 didn't have a precursor m/z or a fragment scan m/z...", elem.get("id") + print "Skipping scan: '%s' ; Could not find a precursor or fragment!" % (elem.get('id')) else: p = precursor @@ -325,6 +336,20 @@ class mzFile(mzAPImzFile): scan_name = elem.get('id') scan_info.append((time, mz, scan_name, 'MS2', scan_mode, polarity, total_ic, 0, 0)) + + # caching here changes the results :-/ + # load mz and intensity arrays proactively + mz, it = zip(*self._scan_from_spec_node(elem, xt)) + empty = [0 for i in range(len(mz))] + self.scan_map[scan_name] = zip(list(mz), list(it), empty, empty, empty, empty) + + n_ms2 = n_ms2 + 1 + else: + #print "Skipping scan: '%s' ; m/z='%f' is out of defined range!"%(elem.get('id'), mz) + n_ms2_filtered = n_ms2_filtered + 1 + else: + #print "Skipping scan: '%s' ; Could not find a precursor!"%(elem.get('id')) + n_ms2_filtered = n_ms2_filtered + 1 else: if self._xp_prof(elem): scan_mode = 'p' @@ -333,10 +358,23 @@ class mzFile(mzAPImzFile): scan_name = elem.get('id') scan_info.append((time, 0.0, scan_name, 'MS1', scan_mode, polarity, total_ic, 0, 0)) + + # caching here changes the results :-/ + # load mz and intensity arrays proactively + mz, it = zip(*self._scan_from_spec_node(elem, xt)) + empty = [0 for i in range(len(mz))] + self.scan_map[scan_name] = zip(list(mz), list(it), empty, empty, empty, empty) + + n_ms1 = n_ms1 + 1 + else: + n_ms1_filtered = n_ms1_filtered + 1 else: print "this spectrum didn't have a scan time...", elem.get("id") + n_ms1_filtered = n_ms1_filtered + 1 elem.clear() + #total_scans = n_ms1+n_ms1_filtered+n_ms2+n_ms2_filtered + #print "Loaded %d of %d scan info objects, %d MS1 scans, %d MS2 scans, filtered %d MS1 scans and %d MS2 scans."%(len(scan_info), total_scans, n_ms1, n_ms2, n_ms1_filtered, n_ms2_filtered) return scan_info def scan_time_from_scan_name(self, scan_name): @@ -361,10 +399,13 @@ class mzFile(mzAPImzFile): else: print "scan not found:", scan_name - def scan(self, time): + def scan(self, scan_id, time): # this implementation takes a few seconds and uses very little memory # (by keeping only the current closest scan) + if self.scan_map.has_key(scan_id): + return self.scan_map[scan_id] + if self._info_file: closest_item = self._info_scans.closest(key='time', value=time) spec_start, spec_size = closest_item['offset'], closest_item['size'] @@ -374,7 +415,6 @@ class mzFile(mzAPImzFile): mz, it = zip(*self._scan_from_spec_node(spec, closest_item['time'], prefix=False)) empty = [0 for i in range(len(mz))] return zip(list(mz), list(it), empty, empty, empty, empty) - #return self._scan_from_spec_node(spec, closest_item['time'], prefix=False) self.fileobj.seek(0) context = etree.iterparse(self.fileobj, events=('end',), @@ -600,7 +640,7 @@ class mzFileInMemory: # now there's nothing to do, because there's no handle on the mzML file #pass - def scan_list(self, start_timeNone, stop_time=None, start_mz=0, stop_mz=99999): + def scan_list(self, start_time=None, stop_time=None, start_mz=0, stop_mz=99999): if start_time is None or stop_time is None: (file_start_time, file_stop_time) = self.time_range() if start_time is None: diff --git a/lx/readSpectra.py b/lx/readSpectra.py index 9db433d16a15e33d6c9dd36ac04a55459805e20f..30ac26f71c492366fef5c14cb886d80226f4f978 100644 --- a/lx/readSpectra.py +++ b/lx/readSpectra.py @@ -113,7 +113,7 @@ def add_Sample( else: polarity = pol - scan = mz_file.scan(t) + scan = mz_file.scan(scan_id=sn, time=t) # don't consider empty scans if len(scan) == 0: @@ -159,7 +159,7 @@ def add_Sample( else: polarity = pol - scan = mz_file.scan(t) + scan = mz_file.scan(scan_id=sn, time=t) # don't consider empty scans if len(scan) == 0: @@ -571,7 +571,7 @@ def add_Sample_AVG( else: polarity = pol - scan = mz_file.scan(t) + scan = mz_file.scan(scan_id=sn, time=t) # don't consider empty scans if len(scan) == 0: