Commit b0baa719 authored by rpeckner-broad's avatar rpeckner-broad Committed by GitHub
Browse files

Add files via upload

parent e9eb40ef
import sys
import time
import pandas as pd
import pymzml
import numpy as np
from numpy import linalg
from scipy import stats
from scipy import integrate
from scipy import sparse
from scipy.optimize import fsolve
from scipy.stats import mvn
import os
import random
import itertools
import cPickle as pickle
from functools import partial
import sparse_nnls
from pyspark import SparkConf,SparkContext
import sqlite3
import struct
import zlib
import csv
import re
def RegressSpectraOntoLibrary(DIASpectraIterator, Library, tol):
    """Regress each acquired DIA (MS2) spectrum onto the reference spectral library.

    For every spectrum in the partition, library entries whose precursor m/z
    falls in the scan's isolation window are assembled into a sparse design
    matrix (one column per candidate, rows = merged DIA m/z bins), and a
    nonnegative least-squares problem is solved for the library coefficients.

    Parameters:
        DIASpectraIterator: iterable of records
            [peak list, precursorMZ, scan time, scan index, windowWidth],
            one per MS2 scan (shape produced by the Spark driver below).
        Library: Spark broadcast variable wrapping the spectral-library dict;
            each value holds 'Spectrum' (n x 2 array of m/z, intensity) and
            'PrecursorMZ'.
        tol: relative m/z tolerance used both to merge nearby DIA peaks and to
            match library peaks to DIA peaks.

    Yields:
        A list of rows per spectrum:
        [coefficient, scanIndex, peptideModSeq, precursorCharge, precMZ,
         precRT, proportionOfTopPeaksFound], or one all-zero placeholder row
        when nothing was fit.

    Fixes vs. original:
      * `RefSpectraPeakProportions` was referenced on output but never
        defined (NameError whenever any coefficient was nonzero); it is now
        built from ProportionOfReferencePeaksInDIA, aligned with RefSpectraIDs.
      * The proportion computation used Python 2 integer division, which
        truncated almost every value to 0; it now divides by a float.
    """
    RefSpectraLibrary = Library.value
    for DIASpectrum in DIASpectraIterator:
        precMZ = float(DIASpectrum[1])
        precRT = float(DIASpectrum[2])  # MS2 scan retention time, in minutes
        index = DIASpectrum[3]
        windowWidth = DIASpectrum[4]
        DIASpectrum = np.array(DIASpectrum[0])
        LibraryCoeffs = []
        if len(DIASpectrum.shape) == 2:  # skip scans with no/degenerate peak data
            if windowWidth > 0:
                # Candidates: library precursors inside the isolation window
                CandidateRefSpectraLibrary = [spectrum['Spectrum'] for key, spectrum in RefSpectraLibrary.iteritems() if
                                              abs(float(spectrum['PrecursorMZ']) - precMZ) < windowWidth]
                MassWindowCandidates = [key for key, spectrum in RefSpectraLibrary.iteritems() if
                                        abs(float(spectrum['PrecursorMZ']) - precMZ) < windowWidth]
            else:
                # Unknown window width: fall back to everything above precMZ
                CandidateRefSpectraLibrary = [spectrum['Spectrum'] for key, spectrum in RefSpectraLibrary.iteritems() if
                                              float(spectrum['PrecursorMZ']) > precMZ]
                MassWindowCandidates = [key for key, spectrum in RefSpectraLibrary.iteritems() if
                                        float(spectrum['PrecursorMZ']) > precMZ]
            #MERGING OF POINTS IN ACQUIRED SPECTRUM WITH NEARBY M/Z COORDINATES
            MergedDIASpecCoordIndices = np.searchsorted(DIASpectrum[:,0]+tol*DIASpectrum[:,0],DIASpectrum[:,0])
            MergedDIASpecCoords = DIASpectrum[np.unique(MergedDIASpecCoordIndices),0]
            MergedDIASpecIntensities = [np.mean(DIASpectrum[np.where(MergedDIASpecCoordIndices == i)[0],1]) for i in np.unique(MergedDIASpecCoordIndices)]
            DIASpectrum = np.array((MergedDIASpecCoords,MergedDIASpecIntensities)).transpose()
            #FILTER LIBRARY SPECTRA BY THE CONDITION THAT SOME NUMBER OF THEIR 10 MOST INTENSE PEAKS BELONG TO THE DIA SPECTRUM
            # CentroidBreaks holds the sorted edges of the +/-tol interval around
            # each DIA peak; a searchsorted result with odd parity means the
            # query m/z landed INSIDE one of those intervals.
            CentroidBreaks = np.concatenate((DIASpectrum[:,0]-tol*DIASpectrum[:,0],DIASpectrum[:,0]+tol*DIASpectrum[:,0]))
            CentroidBreaks = np.sort(CentroidBreaks)
            LocateReferenceCoordsInDIA = [np.searchsorted(CentroidBreaks,M[:,0]) for M in CandidateRefSpectraLibrary]
            TopTenPeaksCoordsInDIA = [np.searchsorted(CentroidBreaks,M[np.argsort(-M[:,1])[0:min(10,M.shape[0])],0]) for M in CandidateRefSpectraLibrary]
            # Keep candidates with more than 5 of their top-ten peaks in the scan
            ReferencePeaksInDIA = [i for i in range(len(MassWindowCandidates)) if
                                   len([a for a in TopTenPeaksCoordsInDIA[i] if a % 2 == 1]) > 5]
            # Fraction of each candidate's matched top-ten peaks over its total
            # peak count (float division fixes the Py2 integer-truncation bug).
            ProportionOfReferencePeaksInDIA = [len([a for a in TopTenPeaksCoordsInDIA[i] if a % 2 == 1])/float(CandidateRefSpectraLibrary[i].shape[0])
                                               for i in range(len(MassWindowCandidates))]
            RefPeptideCandidatesLocations = [LocateReferenceCoordsInDIA[i] for i in ReferencePeaksInDIA]
            RefPeptideCandidateList = [CandidateRefSpectraLibrary[i] for i in ReferencePeaksInDIA]
            RefPeptideCandidates = [MassWindowCandidates[i] for i in ReferencePeaksInDIA]
            NormalizedRefPeptideCandidateList = [M[:,1]/sum(M[:,1]) for M in RefPeptideCandidateList]
            # Odd searchsorted index 2k+1 corresponds to DIA peak k.
            RefSpectraLibrarySparseRowIndices = (np.array([i for v in RefPeptideCandidatesLocations for i in v if i % 2 == 1]) + 1)/2
            RefSpectraLibrarySparseRowIndices = RefSpectraLibrarySparseRowIndices - 1 #Respect the 0-indexing
            RefSpectraLibrarySparseColumnIndices = np.array([i for j in range(len(RefPeptideCandidates)) for i in [j]*len([k for k in RefPeptideCandidatesLocations[j] if k % 2 == 1])])
            RefSpectraLibrarySparseMatrixEntries = np.array([NormalizedRefPeptideCandidateList[k][i] for k in range(len(NormalizedRefPeptideCandidateList)) for i in range(len(NormalizedRefPeptideCandidateList[k]))
                                                             if RefPeptideCandidatesLocations[k][i] % 2 == 1])
            if (len(RefSpectraLibrarySparseRowIndices) > 0 and len(RefSpectraLibrarySparseColumnIndices) > 0 and len(RefSpectraLibrarySparseMatrixEntries) > 0):
                UniqueRowIndices = [i for i in set(RefSpectraLibrarySparseRowIndices)]
                UniqueRowIndices.sort()
                DIASpectrumIntensities=DIASpectrum[UniqueRowIndices,1] #Project the spectrum to those m/z bins at which at least one column of the coefficient matrix has a nonzero entry
                DIASpectrumIntensities=np.append(DIASpectrumIntensities,[0]) #Add a zero to the end of the DIA data vector to penalize
                                                                            #peaks of library spectra not present in the DIA spectrum
                #AUGMENT THE LIBRARY MATRIX WITH TOTAL ION INTENSITIES OF PEAKS OF LIBRARY SPECTRA THAT DON'T CORRESPOND TO PEAKS IN DIA SPECTRUM
                ReferencePeaksNotInDIA = np.array([k for v in RefPeptideCandidatesLocations for k in range(len(v)) if v[k] % 2 == 0])
                SparseColumnIndicesForPeaksNotInDIA = np.arange(len(RefPeptideCandidates))
                NumRowsOfLibraryMatrix = max(UniqueRowIndices)
                SparseRowIndicesForPeaksNotInDIA = [NumRowsOfLibraryMatrix+1]*len(SparseColumnIndicesForPeaksNotInDIA)
                #Duplicate (i,j) entries are summed together, yielding total ion intensities
                SparseMatrixEntriesForPeaksNotInDIA = np.array([np.sum([NormalizedRefPeptideCandidateList[j][k]
                                                                        for k in range(len(NormalizedRefPeptideCandidateList[j]))
                                                                        if RefPeptideCandidatesLocations[j][k] % 2 == 0])
                                                                for j in range(len(NormalizedRefPeptideCandidateList))])
                SparseRowIndices=np.append(RefSpectraLibrarySparseRowIndices,SparseRowIndicesForPeaksNotInDIA)
                SparseColumnIndices=np.append(RefSpectraLibrarySparseColumnIndices,SparseColumnIndicesForPeaksNotInDIA)
                SparseMatrixEntries=np.append(RefSpectraLibrarySparseMatrixEntries,SparseMatrixEntriesForPeaksNotInDIA)
                SparseRowIndices = stats.rankdata(SparseRowIndices,method='dense').astype(int) - 1 #Renumber the row indices according to the projected spectrum,
                                                                                                   #respecting the 0-indexing
                LibrarySparseMatrix = sparse.coo_matrix((SparseMatrixEntries,(SparseRowIndices,SparseColumnIndices)))
                LibraryCoeffs = sparse_nnls.lsqnonneg(LibrarySparseMatrix,DIASpectrumIntensities,{'show_progress': False})
                LibraryCoeffs = LibraryCoeffs['x']
        NonzeroCoeffs = [c for c in LibraryCoeffs if c != 0]
        NonzeroCoeffsAboveThreshold = NonzeroCoeffs
        Output = [[0,index,0,0,0,0,tol]]  # placeholder row when nothing was fit
        if len(NonzeroCoeffs) > 0:
            # Library keys are (peptideModSeq, precursorCharge) tuples.
            RefSpectraIDs = [RefPeptideCandidates[j] for j in range(len(RefPeptideCandidates)) if LibraryCoeffs[j] != 0]
            # Peak-match proportions aligned with RefSpectraIDs (this variable
            # was missing in the original, causing a NameError here).
            RefSpectraPeakProportions = [ProportionOfReferencePeaksInDIA[ReferencePeaksInDIA[j]]
                                         for j in range(len(RefPeptideCandidates)) if LibraryCoeffs[j] != 0]
            Output = [[NonzeroCoeffsAboveThreshold[i],index,RefSpectraIDs[i][0],RefSpectraIDs[i][1],precMZ,precRT,RefSpectraPeakProportions[i]] for i in range(len(NonzeroCoeffsAboveThreshold))]
        yield Output
if __name__ == "__main__":
    # Command line:
    #   Specter.py <mzML basename> <library basename> [scanIndex] [start|end]
    #              [numPartitions] [instrument] [delta]
    args = sys.argv
    mzMLname = args[1] #dirName = args[1]
    libName = args[2]
    # Scan index used to split the run into a "start" half and an "end" half.
    Index = 0
    if len(args) >= 4:
        Index = int(args[3])
    StartOrEnd = "start"
    if len(args) >= 5:
        StartOrEnd = args[4]
    numPartitions = 200
    if len(args) >= 6:
        numPartitions = int(args[5])
    instrument = 'orbitrap'
    # NOTE(review): the two checks below use == where the earlier optional
    # arguments use >=, so passing 8 args leaves `instrument` at its default
    # — confirm whether that is intended.
    if len(args) == 7:
        instrument=args[6]
    delta = 1e-5
    if len(args) == 8:
        delta=float(args[7])
    #Cast the spectral library as a dictionary
    start = time.time()
    libPath = os.path.expanduser(libName+'.blib')
    if os.path.exists(libPath):
        # BiblioSpec (.blib) library: read the spectra out of the SQLite tables.
        Lib = sqlite3.connect(libPath)
        LibPrecursorInfo = pd.read_sql("SELECT * FROM RefSpectra",Lib)
        SpectraLibrary = {}
        for i in range(len(LibPrecursorInfo)):
            precursorID = str(LibPrecursorInfo['id'][i])
            # Library key: (modified peptide sequence, precursor charge).
            precursorKey = (LibPrecursorInfo['peptideModSeq'][i],LibPrecursorInfo['precursorCharge'][i])
            NumPeaks = pd.read_sql("SELECT numPeaks FROM RefSpectra WHERE id = "+precursorID,Lib)['numPeaks'][0]
            SpectrumMZ = pd.read_sql("SELECT peakMZ FROM RefSpectraPeaks WHERE RefSpectraID = " + precursorID,Lib)['peakMZ'][0]
            SpectrumIntensities = pd.read_sql("SELECT peakIntensity FROM RefSpectraPeaks WHERE RefSpectraID = "+precursorID,Lib)['peakIntensity'][0]
            # Peak arrays are binary blobs: 8-byte doubles (m/z) and 4-byte
            # floats (intensity); either blob may be zlib-compressed, which is
            # detected by comparing blob length to the expected raw size.
            if len(SpectrumMZ) == 8*NumPeaks and len(SpectrumIntensities) == 4*NumPeaks:
                # Both blobs uncompressed.
                SpectraLibrary.setdefault(precursorKey,{})
                SpectrumMZ = struct.unpack('d'*NumPeaks,SpectrumMZ)
                SpectrumIntensities = struct.unpack('f'*NumPeaks,SpectrumIntensities)
                SpectraLibrary[precursorKey]['Spectrum'] = np.array((SpectrumMZ,SpectrumIntensities)).T
                SpectraLibrary[precursorKey]['PrecursorMZ'] = LibPrecursorInfo['precursorMZ'][i]
                SpectraLibrary[precursorKey]['PrecursorRT'] = LibPrecursorInfo['retentionTime'][i] #The library retention time is given in minutes
            elif len(SpectrumIntensities) == 4*NumPeaks:
                # m/z blob compressed, intensities raw.
                SpectraLibrary.setdefault(precursorKey,{})
                SpectrumMZ = struct.unpack('d'*NumPeaks,zlib.decompress(SpectrumMZ))
                SpectrumIntensities = struct.unpack('f'*NumPeaks,SpectrumIntensities)
                SpectraLibrary[precursorKey]['Spectrum'] = np.array((SpectrumMZ,SpectrumIntensities)).T
                SpectraLibrary[precursorKey]['PrecursorMZ'] = LibPrecursorInfo['precursorMZ'][i]
                SpectraLibrary[precursorKey]['PrecursorRT'] = LibPrecursorInfo['retentionTime'][i]
            elif len(SpectrumMZ) == 8*NumPeaks:
                # Intensity blob compressed, m/z raw. NOTE(review): the
                # both-compressed case is silently dropped — confirm that is
                # acceptable for the libraries in use.
                SpectraLibrary.setdefault(precursorKey,{})
                SpectrumMZ = struct.unpack('d'*NumPeaks,SpectrumMZ)
                SpectrumIntensities = struct.unpack('f'*NumPeaks,zlib.decompress(SpectrumIntensities))
                SpectraLibrary[precursorKey]['Spectrum'] = np.array((SpectrumMZ,SpectrumIntensities)).T
                SpectraLibrary[precursorKey]['PrecursorMZ'] = LibPrecursorInfo['precursorMZ'][i]
                SpectraLibrary[precursorKey]['PrecursorRT'] = LibPrecursorInfo['retentionTime'][i]
            #print i
    else:
        # No .blib found: assume libName points at a pickled library dict.
        SpectraLibrary = pickle.load(open(libName,"rb"))
    print "Library loaded in {} minutes".format(round((time.time()-start)/60,1))
    path = os.path.expanduser(mzMLname+'.mzML')
    DIArun = pymzml.run.Reader(path)
    E = enumerate(DIArun)
    start = time.time()
    # Collect MS2 spectra (peaks, precursor m/z, scan time, scan index) from
    # the half of the run selected by StartOrEnd relative to Index.
    if StartOrEnd == "start":
        if instrument == 'orbitrap':
            res = [[spectrum.peaks,spectrum['precursors'][0]['mz'],spectrum['scan time'],i] for i,spectrum in E if spectrum['ms level'] == 2.0 and i < Index]
        if instrument == 'tof':
            res = [[spectrum.peaks,spectrum['precursors'][0]['mz'],spectrum['scan time'],i] for i,spectrum in E if 'precursors' in spectrum.keys() and i < Index]
    else:
        if instrument == 'orbitrap':
            res = [[spectrum.peaks,spectrum['precursors'][0]['mz'],spectrum['scan time'],i] for i,spectrum in E if spectrum['ms level'] == 2.0 and i >= Index]
        if instrument == 'tof':
            res = [[spectrum.peaks,spectrum['precursors'][0]['mz'],spectrum['scan time'],i] for i,spectrum in E if 'precursors' in spectrum.keys() and i >= Index]
    print "Loaded {} MS2 spectra from {} in {} minutes.".format(len(res),path,round((time.time()-start)/60,1))
    # Append each scan's isolation-window width, estimated as the precursor m/z
    # gap to the next scan (the final scan is dropped by this step).
    res=[[res[i][0],res[i][1],res[i][2],res[i][3],float(res[i+1][1]) - float(res[i][1])] for i in range(len(res)-1)]
    MaxWindowPrecMZ = max([x[1] for x in res]) + max([x[4] for x in res])
    # Discard library entries that cannot fall into any isolation window.
    SpectraLibrary = {k:SpectraLibrary[k] for k in SpectraLibrary if SpectraLibrary[k]['PrecursorMZ'] < MaxWindowPrecMZ}
    conf = (SparkConf().set("spark.driver.maxResultSize", "25g"))
    sc = SparkContext(conf=conf,appName="Specter",pyFiles=['sparse_nnls.py'])
    #Recast the library as a broadcast variable to improve performance
    BroadcastLibrary = sc.broadcast(SpectraLibrary)
    res = sc.parallelize(res, numPartitions)
    # NOTE(review): tol = delta*1e-6 reads as "delta in ppm"; with the default
    # delta of 1e-5 this yields 1e-11 — confirm the intended units of delta.
    output = res.mapPartitions(partial(RegressSpectraOntoLibrary, Library=BroadcastLibrary, tol=delta*1e-6)).collect()
    # Flatten the per-partition lists of rows into one flat list of rows.
    output = [[output[i][j][0],output[i][j][1],output[i][j][2],output[i][j][3],
               output[i][j][4],output[i][j][5],output[i][j][6]] for i in range(len(output)) for j in range(len(output[i]))]
    absolutePath = mzMLname.rsplit('/',1)[0]
    noPathName = mzMLname.rsplit('/',1)[1]
    if '/' in libName:
        libName = libName.rsplit('/',1)[1]
    outputPath = os.path.expanduser(absolutePath+'/SpecterResults/'+noPathName+'_SpecterCoeffs_'+libName+'.csv')
    # Open in append mode so separate start/end invocations accumulate rows
    # into the same CSV.
    with open(outputPath, "ab") as f:
        writer = csv.writer(f)
        writer.writerows(output)
    print "Output written to {}.".format(outputPath)
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 22 14:45:53 2016
@author: rpeckner
"""
#!/usr/bin/python
# See http://maggotroot.blogspot.ch/2013/11/constrained-linear-least-squares-in.html for more info
'''
A simple library to solve constrained linear least squares problems
with sparse and dense matrices. Uses cvxopt library for
optimization
'''
__author__ = 'Valeriy Vishnevskiy'
__email__ = 'valera.vishnevskiy@yandex.ru'
__version__ = '1.0'
__date__ = '22.11.2013'
__license__ = 'WTFPL'
import numpy as np
import sys
#sys.path.append("/home/unix/rpeckner/anaconda2/lib/python2.7/site-packages/cvxopt/")
from cvxopt import solvers, matrix, spmatrix, mul
import itertools
from scipy import sparse
def scipy_sparse_to_spmatrix(A):
    """Convert a scipy sparse matrix to a CVXOPT spmatrix via COO triplets."""
    as_coo = A.tocoo()
    return spmatrix(as_coo.data, as_coo.row.tolist(), as_coo.col.tolist())
def spmatrix_sparse_to_scipy(A):
    """Convert a CVXOPT spmatrix to a scipy COO sparse matrix."""
    values = np.array(A.V).squeeze()
    row_idx = np.array(A.I).squeeze()
    col_idx = np.array(A.J).squeeze()
    return sparse.coo_matrix((values, (row_idx, col_idx)))
def sparse_None_vstack(A1, A2):
    """Vertically stack two sparse matrices; a None first operand acts as empty."""
    if A1 is None:
        return A2
    return sparse.vstack([A1, A2])
def numpy_None_vstack(A1, A2):
    """Vertically stack two numpy arrays; a None first operand acts as empty."""
    if A1 is None:
        return A2
    return np.vstack([A1, A2])
def numpy_None_concatenate(A1, A2):
    """Concatenate two numpy arrays; a None first operand acts as empty."""
    if A1 is None:
        return A2
    return np.concatenate([A1, A2])
def get_shape(A):
    """Return the (rows, cols) dimensions of a CVXOPT spmatrix or array-like.

    CVXOPT matrices expose their dimensions as `.size`; numpy/scipy objects
    use `.shape`.

    Bug fix: the original body referenced the global name `C` instead of the
    parameter `A`, so it ignored its argument (NameError or wrong result).
    """
    if isinstance(A, spmatrix):
        return A.size
    else:
        return A.shape
def numpy_to_cvxopt_matrix(A):
    """Convert numpy/scipy input to CVXOPT form.

    None and unrecognized objects pass through unchanged; scipy sparse
    matrices become CVXOPT spmatrix; numpy arrays become dense CVXOPT
    matrices of doubles (1-D arrays are reshaped into column vectors).
    """
    if A is None:
        return A
    if sparse.issparse(A):
        # Only scipy's own sparse types are converted; other sparse-like
        # objects (e.g. an spmatrix already) are returned as-is.
        if isinstance(A, sparse.spmatrix):
            return scipy_sparse_to_spmatrix(A)
        return A
    if isinstance(A, np.ndarray):
        dims = (A.shape[0], 1) if A.ndim == 1 else A.shape
        return matrix(A, dims, 'd')
    return A
def cvxopt_to_numpy_matrix(A):
    """Convert CVXOPT matrix types back to numpy/scipy form.

    None passes through; an spmatrix becomes a scipy COO matrix; everything
    else (dense CVXOPT matrix, scalar, sequence) becomes a squeezed ndarray.
    """
    if A is None:
        return A
    if isinstance(A, spmatrix):
        return spmatrix_sparse_to_scipy(A)
    # Dense cvxopt matrix and any other array-like take the same path.
    return np.array(A).squeeze()
def lsqlin(C, d, reg=0, A=None, b=None, Aeq=None, beq=None, \
        lb=None, ub=None, x0=None, opts=None):
    '''
    Solve linear constrained l2-regularized least squares. Can
    handle both dense and sparse matrices. Matlab's lsqlin
    equivalent. It is actually wrapper around CVXOPT QP solver.
        min_x ||C*x  - d||^2_2 + reg * ||x||^2_2
        s.t.  A * x <= b
              Aeq * x = beq
              lb <= x <= ub
    Input arguments:
        C   is m x n dense or sparse matrix
        d   is n x 1 dense matrix
        reg is regularization parameter
        A   is p x n dense or sparse matrix
        b   is p x 1 dense matrix
        Aeq is q x n dense or sparse matrix
        beq is q x 1 dense matrix
        lb  is n x 1 matrix or scalar
        ub  is n x 1 matrix or scalar
    Output arguments:
        Return dictionary, the output of CVXOPT QP.
    Dont pass matlab-like empty lists to avoid setting parameters,
    just use None:
        lsqlin(C, d, 0.05, None, None, Aeq, beq) #Correct
        lsqlin(C, d, 0.05, [], [], Aeq, beq) #Wrong!
    '''
    sparse_case = False
    if sparse.issparse(A): #detects both np and cxopt sparse
        sparse_case = True
        #We need A to be scipy sparse, as I couldn't find how
        #CVXOPT spmatrix can be vstacked
        if isinstance(A, spmatrix):
            A = spmatrix_sparse_to_scipy(A)
    C = numpy_to_cvxopt_matrix(C)
    d = numpy_to_cvxopt_matrix(d)
    # QP form of the objective: ||Cx - d||^2 expands to x'(C'C)x - 2(d'C)x + const,
    # so Q = C'C is the Hessian and q = -C'd the linear term.
    Q = C.T * C
    q = - d.T * C
    nvars = C.size[1]
    if reg > 0:
        # Tikhonov regularization: add reg * I to the Hessian.
        if sparse_case:
            I = scipy_sparse_to_spmatrix(sparse.eye(nvars, nvars,\
                format='coo'))
        else:
            I = matrix(np.eye(nvars), (nvars, nvars), 'd')
        Q = Q + reg * I
    # Normalize bound arguments to numpy form before building inequality rows.
    lb = cvxopt_to_numpy_matrix(lb)
    ub = cvxopt_to_numpy_matrix(ub)
    b = cvxopt_to_numpy_matrix(b)
    if lb is not None:  #Modify 'A' and 'b' to add lb inequalities
        if lb.size == 1:
            # Scalar bound: broadcast to every variable.
            lb = np.repeat(lb, nvars)
        if sparse_case:
            lb_A = -sparse.eye(nvars, nvars, format='coo')
            A = sparse_None_vstack(A, lb_A)
        else:
            lb_A = -np.eye(nvars)
            A = numpy_None_vstack(A, lb_A)
        # lb <= x is encoded as -x <= -lb.
        b = numpy_None_concatenate(b, -lb)
    if ub is not None:  #Modify 'A' and 'b' to add ub inequalities
        if ub.size == 1:
            ub = np.repeat(ub, nvars)
        if sparse_case:
            ub_A = sparse.eye(nvars, nvars, format='coo')
            A = sparse_None_vstack(A, ub_A)
        else:
            ub_A = np.eye(nvars)
            A = numpy_None_vstack(A, ub_A)
        b = numpy_None_concatenate(b, ub)
    #Convert data to CVXOPT format
    A = numpy_to_cvxopt_matrix(A)
    Aeq = numpy_to_cvxopt_matrix(Aeq)
    b = numpy_to_cvxopt_matrix(b)
    beq = numpy_to_cvxopt_matrix(beq)
    #Set up options
    if opts is not None:
        # Options (e.g. 'show_progress') are applied globally to cvxopt.solvers.
        for k, v in opts.items():
            solvers.options[k] = v
    #Run CVXOPT.SQP solver
    sol = solvers.qp(Q, q.T, A, b, Aeq, beq, None, x0)
    return sol
def lsqnonneg(C, d, opts):
    '''
    Nonnegative linear least squares:
        min_x ||C*x - d||_2^2   subject to   x >= 0
    Thin wrapper around lsqlin with a zero lower bound, no upper bound,
    no regularization and no other constraints.
    '''
    return lsqlin(C, d, reg=0, A=None, b=None, Aeq=None,
                  beq=None, lb=0, ub=None, x0=None, opts=opts)
if __name__ == '__main__':
    # simple Testing routines
    # Exercise lsqlin over every combination of dense numpy, scipy sparse and
    # CVXOPT sparse inputs to verify all conversion paths give the same answer.
    C = np.array(np.mat('''0.9501,0.7620,0.6153,0.4057;
        0.2311,0.4564,0.7919,0.9354;
        0.6068,0.0185,0.9218,0.9169;
        0.4859,0.8214,0.7382,0.4102;
        0.8912,0.4447,0.1762,0.8936'''))
    sC = sparse.coo_matrix(C)
    csC = scipy_sparse_to_spmatrix(sC)
    A = np.array(np.mat('''0.2027,0.2721,0.7467,0.4659;
        0.1987,0.1988,0.4450,0.4186;
        0.6037,0.0152,0.9318,0.8462'''))
    sA = sparse.coo_matrix(A)
    csA = scipy_sparse_to_spmatrix(sA)
    d = np.array([0.0578, 0.3528, 0.8131, 0.0098, 0.1388])
    md = matrix(d)
    b = np.array([0.5251, 0.2026, 0.6721])
    mb = matrix(b)
    # Bounds in three flavors: numpy vector, CVXOPT matrix, plain scalar.
    lb = np.array([-0.1] * 4)
    mlb = matrix(lb)
    mmlb = -0.1
    ub = np.array([2] * 4)
    mub = matrix(ub)
    mmub = 2
    #solvers.options[show_progress'] = False
    opts = {'show_progress': False}
    # Every combination of input representations should solve identically.
    for iC in [C, sC, csC]:
        for iA in [A, sA, csA]:
            for iD in [d, md]:
                for ilb in [lb, mlb, mmlb]:
                    for iub in [ub, mub, mmub]:
                        for ib in [b, mb]:
                            ret = lsqlin(iC, iD, 0, iA, ib, None, None, ilb, iub, None, opts)
                            print ret['x'].T
    print 'Should be [-1.00e-01 -1.00e-01 2.15e-01 3.50e-01]'
    #test lsqnonneg
    C = np.array([[0.0372, 0.2869], [0.6861, 0.7071], [0.6233, 0.6245], [0.6344, 0.6170]]);
    d = np.array([0.8587, 0.1781, 0.0747, 0.8405]);
    ret = lsqnonneg(C, d, {'show_progress': False})
    print ret['x'].T
    print 'Should be [2.5e-07; 6.93e-01]'
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment