Commit 964612cf authored by rpeckner-broad's avatar rpeckner-broad Committed by GitHub
Browse files

Add files via upload

parent 3b7f65b7
#!/bin/bash
spark-submit --driver-memory $1 Specter_Spark.py $2 $3 $4 $5 $6 $7 $8
Rscript SpecterQuant.R $2 $3
\ No newline at end of file
SparkSpecterCoeffs = function(rawCoeffsFilepath,ExperimentHeader=NULL, lib=NULL,MS1=FALSE)
{ library(data.table)
DIA_RefSpectraCoeffs = read.csv(rawCoeffsFilepath,header=FALSE,stringsAsFactors = FALSE)
names(DIA_RefSpectraCoeffs) = c("Coeff","Scan","RefPeptideSeq", "RefPrecursorCharge","PrecursorMZ","RetentionTime")
DIA_RefSpectraCoeffs = DIA_RefSpectraCoeffs[c(2,1,3,4,5,6)]
DIA_RefSpectraCoeffs$Coeff = as.numeric(DIA_RefSpectraCoeffs$Coeff)
DIA_RefSpectraCoeffs$Scan = strtoi(DIA_RefSpectraCoeffs$Scan) + 1
DIA_RefSpectraCoeffs$RefPeptideSeq = as.character(DIA_RefSpectraCoeffs$RefPeptideSeq)
DIA_RefSpectraCoeffs$RefPrecursorCharge = strtoi(DIA_RefSpectraCoeffs$RefPrecursorCharge)
DIA_RefSpectraCoeffs$PrecursorMZ = as.numeric(DIA_RefSpectraCoeffs$PrecursorMZ)
DIA_RefSpectraCoeffs$RetentionTime = as.numeric(DIA_RefSpectraCoeffs$RetentionTime)
DIA_RefSpectraCoeffs = na.omit(DIA_RefSpectraCoeffs)
DIA_RefSpectraCoeffs = DIA_RefSpectraCoeffs[with(DIA_RefSpectraCoeffs,order(Scan, RefPeptideSeq, RefPrecursorCharge)),]
#Convert minutes to seconds if need be
if (max(DIA_RefSpectraCoeffs$RetentionTime) < 600)
DIA_RefSpectraCoeffs$RetentionTime = 60*DIA_RefSpectraCoeffs$RetentionTime
DIA_RefSpectraCoeffs = DIA_RefSpectraCoeffs[which(DIA_RefSpectraCoeffs$Coeff != 0),]
DIA_RefSpectraCoeffs = data.table(DIA_RefSpectraCoeffs)
setkey(DIA_RefSpectraCoeffs,RefPeptideSeq,RefPrecursorCharge)
return(DIA_RefSpectraCoeffs)
}
QuantifyAllFromSpecterCoeffs = function(SpecterData,header) {
require(kza)
require(pracma)
require(zoo)
require(moments)
require(data.table)
IDs = data.frame(unique(SpecterData[,c('RefPeptideSeq','RefPrecursorCharge'),with=FALSE]))
IsThereEnoughData = unlist(Map(function(i) EnoughData(SpecterData,IDs,i,RTWindow=FALSE),
seq(1,nrow(IDs))))
f = function(k) {if (IsThereEnoughData[k]) {
SpecterQuantifyPeptides(SpecterData,IDs,k,header,RTWindow=FALSE)} else {list(0,1,0,0,0,0,1,0,0,0)}}
Quant = Map(f,seq(1,nrow(IDs)))
Quants = IDs
Quants$SpecterAreaQuant = as.numeric(unlist(lapply(Quant,function(l) l[[1]][1])))
Quants$SpecterLBPval = as.numeric(unlist(lapply(Quant,function(l) l[[2]][1])))
Quants$SpecterSNR = as.numeric(unlist(lapply(Quant,function(l) l[[3]][1])))
Quants$MaxSpecterCoeffTime = as.numeric(unlist(lapply(Quant,function(l) l[[4]][1])))
Quants$MaxSpecterCoeff = as.numeric(unlist(lapply(Quant,function(l) l[[5]][1])))
Quants$SpecterPeakWidth = as.numeric(unlist(lapply(Quant,function(l) l[[6]][1])))
Quants$SpecterLBPvalOnUniformGrid = as.numeric(unlist(lapply(Quant,function(l) l[[7]][1])))
Quants$PeakVariance = as.numeric(unlist(lapply(Quant,function(l) l[[8]][1])))
Quants$PeakSkewness = as.numeric(unlist(lapply(Quant,function(l) l[[9]][1])))
Quants$PeakKurtosis = as.numeric(unlist(lapply(Quant,function(l) l[[10]][1])))
return(Quants)
}
###################### PROFILE SMOOTHING AND PLOTTING ###########################
SmoothProfile = function(Data,Identifiers,i,header,IntensityCutoff=0,QuantileCutoff=0,RTWindow=TRUE,param=1e-2,type="rollmean",
Plot=TRUE,FilterWindow= 3,KZiters=3,verbose=FALSE,MS1=FALSE,Experiment=NULL,Header=NULL) {
PeptideData = data.frame(Data[list(Identifiers$RefPeptideSeq[i],Identifiers$RefPrecursorCharge[i])])
PrecMZ = unique(PeptideData$RefPrecursorMZ)
PeptideDataNoCutoff = data.frame(PeptideData)
if (RTWindow)
{PeptideData = PeptideData[which(abs(PeptideData$RetentionTime - PeptideData$RefRetentionTime) < 300),]
PeptideDataNoCutoff = PeptideDataNoCutoff[which(abs(PeptideDataNoCutoff$RetentionTime - PeptideDataNoCutoff$RefRetentionTime) < 300),]}
if (verbose)
print(PeptideData)
PeptideData = PeptideData[which(PeptideData$Coeff >= IntensityCutoff),]
PeptideData = PeptideData[which(PeptideData$Coeff >= quantile(PeptideData$Coeff,QuantileCutoff)),]
PeptideDataNoCutoff = PeptideDataNoCutoff[c("RetentionTime","Coeff","PrecursorMZ")]
PeptideDataNoCutoff = PeptideDataNoCutoff[!duplicated(PeptideDataNoCutoff$RetentionTime),]
PeptideData = PeptideData[c("Scan","RetentionTime","Coeff","PrecursorMZ")]
PeptideData = PeptideData[!duplicated(PeptideData$RetentionTime),]
SmoothFrame = data.frame(RetentionTime=numeric(),Coeff=numeric())
KZData = NULL
if (type == "SG")
{SmoothedData = sgolayfilt(PeptideDataOnGrid[,2],p=0,n=3)
SmoothFrame = data.frame(RetentionTime=PeptideDataOnGrid[,1],Coeff=SmoothedData)
}
if (type == "rollmean")
{ZooData = zoo(PeptideData$Coeff, PeptideData$RetentionTime)
SmoothedData = zoo(0,0)
PeptideData$Coeff[which(PeptideData$Coeff < 1)] = 0
if (nrow(PeptideData) > 3) {
MinScanIndex = min(PeptideData$Scan,na.rm=TRUE)
MaxScanIndex = max(PeptideData$Scan,na.rm=TRUE)
HeaderRange = data.frame(header[J(MinScanIndex:MaxScanIndex)]) #header needs to be a data.table with key seqNum
HeaderRange = subset(HeaderRange,msLevel == 2 & floor(precursorMZ) %in% floor(unique(PeptideData$PrecursorMZ)))
DataOnUniformGrid = HeaderRange[c("seqNum","retentionTime")]
DataOnUniformGrid$Coeff = numeric(nrow(DataOnUniformGrid))
DataOnUniformGrid$Coeff[which(DataOnUniformGrid$seqNum %in% PeptideData$Scan)] = PeptideData$Coeff
DataOnUniformGrid = DataOnUniformGrid$Coeff
KZData = kz(DataOnUniformGrid,m=FilterWindow,k=KZiters)}
}
BoxTestPval = 1
if (Plot) {
n = nrow(PeptideDataNoCutoff)
m = nrow(PeptideData)
CombinedData = rbind(PeptideDataNoCutoff,PeptideData)
CombinedData = rbind(CombinedData,KZFrame)
CombinedData$Type = paste0("Kolmogorov-Zurbenko"," (",KZiters," iterations)")
CombinedData$Type[1:n] = "Raw"
CombinedData$Type[(n+1):(n+m)] = "Cutoff"
CombinedData$Type = factor(CombinedData$Type, levels=c("Raw","Cutoff",paste0("Kolmogorov-Zurbenko"," (",KZiters," iterations)")))
if (MS1)
{
MS1Data = Header[which(Header$msLevel == 1),]
PrecursorMS1Candidates = findInterval(PeptideDataNoCutoff$RetentionTime,MS1Data$retentionTime)
PrecursorMS1CandidateScans = mzR::peaks(Experiment,scans=which(Header$msLevel == 1)[PrecursorMS1Candidates])
MS1Intensity = data.frame(RetentionTime=numeric(),Coeff=numeric())
for (k in 1:length(PrecursorMS1Candidates))
{if (any(abs(PrecursorMS1CandidateScans[[k]][,1] - PrecMZ) < 1e-5*PrecMZ))
{ClosestPeak = which(abs(PrecursorMS1CandidateScans[[k]][,1] - PrecMZ) == min(abs(PrecursorMS1CandidateScans[[k]][,1] - PrecMZ)))
ClosestPeakIntensity = PrecursorMS1CandidateScans[[k]][ClosestPeak,2]
MS1Intensity = rbind(MS1Intensity,data.frame(RetentionTime=MS1Data$retentionTime[PrecursorMS1Candidates[k]],Coeff=ClosestPeakIntensity,
PrecursorMZ = PrecursorMS1CandidateScans[[k]][ClosestPeak,1]))
}
}
MS1Intensity$Type = "MS1 (closest precursor m/z)"
CombinedData = rbind(CombinedData,MS1Intensity)
CombinedData$Type = factor(CombinedData$Type, levels=c("Raw","Cutoff","Moving average",paste0("Kolmogorov-Zurbenko"," (",KZiters," iterations)"),
"MS1 (closest precursor m/z)"))
}
p = ggplot(CombinedData,aes(x=RetentionTime,y=Coeff,group=1)) +
geom_line(size=1) + facet_wrap(~Type,ncol=1) + theme(legend.title=element_blank()) +
ggtitle(paste0(Identifiers$RefPeptideSeq[i],", Charge = ",Identifiers$RefPrecursorCharge[i],
", RefPrecursorMZ = ",round(PrecMZ,digits=4),
", Cutoff = ",quantile(PeptideData$Coeff,QuantileCutoff),
", Box test p-val = ",BoxTestPval))
print(p)
}
return(KZData)
}
###################### PEAK DETECTION ###########################
FindPeaksInData = function(Data,Identifiers,i,header,Multiplexed=FALSE,IntensityCutoff=0,
QuantileCutoff=0,RTWindow=TRUE,smooth="rollmean",FilterWindow= 3,KZiters=3) {
PeptideData = data.frame(Data[list(Identifiers$RefPeptideSeq[i],Identifiers$RefPrecursorCharge[i])])
if (RTWindow)
PeptideData = subset(PeptideData, abs(RetentionTime - RefRetentionTime) < 300)
PeptideData = PeptideData[c("Scan","RetentionTime","Coeff","PrecursorMZ")]
PeptideData = PeptideData[!duplicated(PeptideData$RetentionTime),]
PeptideData$Coeff[which(PeptideData$Coeff < 1)] = 0
PeptidePeaks = NULL
if (length(which(PeptideData$Coeff > 0) > 3)) {
MinScanIndex = min(PeptideData$Scan,na.rm=TRUE)
MaxScanIndex = max(PeptideData$Scan,na.rm=TRUE)
HeaderRange = data.frame(header[J(MinScanIndex:MaxScanIndex)]) #header needs to be a data.table with key seqNum
HeaderRange = subset(HeaderRange,msLevel == 2 & floor(precursorMZ) %in% floor(unique(PeptideData$PrecursorMZ)))
DataOnUniformGrid = HeaderRange[c("seqNum","retentionTime")]
DataOnUniformGrid$Coeff = numeric(nrow(DataOnUniformGrid))
DataOnUniformGrid$Coeff[which(DataOnUniformGrid$seqNum %in% PeptideData$Scan)] = PeptideData$Coeff
DataOnUniformGrid = DataOnUniformGrid$Coeff
PeptidePeaks = findpeaks(DataOnUniformGrid, nups = 2, ndowns = 2, npeaks= 10,sortstr = TRUE)
if (smooth == "rollmean")
{
KZData = kz(DataOnUniformGrid,m=FilterWindow,k=KZiters)
KZData[which(DataOnUniformGrid < 1)] = 0 #Remove artifical nonzero coeffs resulting from smoothing
PeptidePeaks = findpeaks(KZData, nups = 2, ndowns = 2, npeaks= 10,sortstr = TRUE)
}
}
return(PeptidePeaks)
}
###################### QUANTITATION ###########################
SpecterQuantifyPeptides = function(Data,Identifiers,i,header,IntensityCutoff=0,QuantileCutoff=0,RTWindow=TRUE,smooth="rollmean",FilterWindow= 3,KZiters=3) {
PeptideData = data.frame(Data[list(Identifiers$RefPeptideSeq[i],Identifiers$RefPrecursorCharge[i])])
if (RTWindow)
PeptideData = subset(PeptideData, abs(RetentionTime - RefRetentionTime) < 300)
PeptideData = PeptideData[c("Scan","RetentionTime","Coeff","PrecursorMZ")]
PeptideData = PeptideData[!duplicated(PeptideData$RetentionTime),]
PeptideData$Coeff[which(PeptideData$Coeff < 1)] = 0 #Total ion intensities less than 1 are discarded as physically meaningless.
area = 0
RawPeaks = FindPeaksInData(Data,Identifiers,i,header,IntensityCutoff=IntensityCutoff,
QuantileCutoff=QuantileCutoff,RTWindow=RTWindow,smooth="none",FilterWindow=FilterWindow)
SmoothPeaks = FindPeaksInData(Data,Identifiers,i,header,
IntensityCutoff=IntensityCutoff,QuantileCutoff=QuantileCutoff,RTWindow=RTWindow,smooth=smooth,FilterWindow=FilterWindow,KZiters=KZiters)
BoxTestPval = 1
BoxTestPvalOnGrid = 1
MaxCoeff= 0
TimeAtTopOfPeak = 0
snr = 0
PeakCentralMoments = numeric(3)
result = list(Area=area,BoxTestPval=BoxTestPval,SNR=snr,RT=TimeAtTopOfPeak,MaxCoeff=0,PeakWidth=0,BoxTestPvalOnGrid=BoxTestPvalOnGrid,
Variance=0,Skewness=0,Kurtosis=0)
if (length(which(PeptideData$Coeff > 1) > 3) & !is.null(SmoothPeaks)) {
MinScanIndex = min(PeptideData$Scan,na.rm=TRUE)
MaxScanIndex = max(PeptideData$Scan,na.rm=TRUE)
HeaderRange = data.frame(header[J(MinScanIndex:MaxScanIndex)]) #header needs to be a data.table with key seqNum
HeaderRange = subset(HeaderRange,msLevel == 2 & floor(precursorMZ) %in% floor(unique(PeptideData$PrecursorMZ)))
DataOnUniformGrid = HeaderRange[c("seqNum","retentionTime")]
DataOnUniformGrid$Coeff = numeric(nrow(DataOnUniformGrid))
DataOnUniformGrid$Coeff[which(DataOnUniformGrid$seqNum %in% PeptideData$Scan)] = PeptideData$Coeff
start = SmoothPeaks[1,3]
end = SmoothPeaks[1,4]
PeakDataOnUniformGrid = DataOnUniformGrid[start:end,]
area = trapz(PeakDataOnUniformGrid$retentionTime, PeakDataOnUniformGrid$Coeff)
TimeAtTopOfPeak = DataOnUniformGrid$retentionTime[SmoothPeaks[1,2]]
MaxCoeff=DataOnUniformGrid$Coeff[SmoothPeaks[1,2]]
start = SmoothPeaks[1,3]
end = SmoothPeaks[1,4]
PeakDataOnUniformGrid = DataOnUniformGrid[start:end,]
PeakCentralMoments = all.moments(scale(PeakDataOnUniformGrid$Coeff,center=FALSE),order.max = 4,central = TRUE)[3:5]
BoxTest = Box.test(PeptideData$Coeff,type="Ljung-Box")
BoxTestPval = round(BoxTest$p.value,digits=5)
BoxTestOnGrid = Box.test(DataOnUniformGrid$Coeff,type="Ljung-Box")
BoxTestPvalOnGrid = round(BoxTestOnGrid$p.value,digits=5)
snr=area/sd(DataOnUniformGrid$Coeff[-seq(start,end)]) #Denominator is the standard deviation of the coefficients not falling within the highest peak
peakWidth = DataOnUniformGrid$retentionTime[end] - DataOnUniformGrid$retentionTime[start]
result = list(Area=area,BoxTestPval=BoxTestPval,SNR=snr,RT=TimeAtTopOfPeak,MaxCoeff=MaxCoeff,PeakWidth=peakWidth,
BoxTestPvalOnGrid=BoxTestPvalOnGrid,Variance=PeakCentralMoments[1],Skewness=PeakCentralMoments[2],Kurtosis=PeakCentralMoments[3])
}
return(result)
}
EnoughData = function(Data,Identifiers,i,RTWindow=TRUE) {
PeptideData = Data[list(Identifiers$RefPeptideSeq[i],Identifiers$RefPrecursorCharge[i])]
if (RTWindow)
PeptideData = subset(PeptideData, abs(RetentionTime - RefRetentionTime) < 300)
if (nrow(PeptideData) > 0)
{return(TRUE)} else {
return(FALSE)
}
}
library(MASS)
library(data.table)
args=commandArgs(TRUE)
mzmlPath = paste0(args[1],".mzML")
dirName = strsplit(mzmlPath,split="/")[[1]]
mzmlName= gsub(".mzML","",dirName[length(dirName)])
libName = strsplit(args[2],split="/")[[1]]
libName = libName[length(libName)]
dirName = dirName[-length(dirName)]
dirName = paste(dirName,collapse="/")
resultsPath = paste0(dirName,"/SpecterResults/",mzmlName,"_",libName,"_SpecterCoeffs.csv")
headerPath = paste0(dirName,"/SpecterResults/",mzmlName,"_",libName,"_header.csv")
decoyResultsPath = paste0(gsub(".csv","",resultsPath),"Decoys.csv")
h = read.csv(headerPath,header=FALSE,stringsAsFactors=FALSE)
names(h) = c("precursorMZ","retentionTime","seqNum")
h$retentionTime = 60*h$retentionTime
h$seqNum = h$seqNum + 1
h$msLevel = 2
h = data.table(h)
setkey(h,seqNum)
results = SparkSpecterCoeffs(resultsPath)
resultsWithDecoys = SparkSpecterCoeffs(decoyResultsPath)
quants = QuantifyAllFromSpecterCoeffs(results,header = h)
quantsWithDecoys = QuantifyAllFromSpecterCoeffs(resultsWithDecoys,header=h)
quantsWithDecoys$Type = ifelse(grepl("DECOY",quantsWithDecoys$RefPeptideSeq),"Decoy","Target")
quantsWithDecoys = quantsWithDecoys[which(quantsWithDecoys$SpecterAreaQuant > 0),]
SpecterLDA = lda(Type~ MaxSpecterCoeff + PeakVariance + PeakSkewness + PeakKurtosis,
data = quantsWithDecoys)
plda = predict(object = SpecterLDA, newdata = quantsWithDecoys)
D = data.frame(PeptideSeq = quantsWithDecoys$RefPeptideSeq,
PrecursorCharge = quantsWithDecoys$RefPrecursorCharge,
Type=quantsWithDecoys$Type,
Score=as.numeric(unlist(plda$x)))
FDR = function(score) {
return(length(which(D$Type == "Decoy" & D$Score > score))/length(which(D$Score > score)))
}
fdr = as.numeric(sapply(D$Score,FDR))
cutoff = min(D$Score[which(fdr < 0.01)],na.rm=TRUE)
D = D[which(D$Score >= cutoff),]
quants = quants[which(paste(quants$RefPeptideSeq,quants$RefPrecursorCharge) %in% paste(D$PeptideSeq,D$PrecursorCharge)),]
quants$Score = D$Score[match(paste(quants$RefPeptideSeq,quants$RefPrecursorCharge),paste(D$PeptideSeq,D$PrecursorCharge))]
outputPath = gsub("_SpecterCoeffs.csv","_SpecterQuants.csv",resultsPath)
write.csv(quants[c("RefPeptideSeq","RefPrecursorCharge","SpecterAreaQuant","Score")],file=outputPath,quote=FALSE,row.names=FALSE)
import sys
import time
import pandas as pd
import pymzml
import numpy as np
from numpy import linalg
from scipy import stats
from scipy import integrate
from scipy import sparse
from scipy.optimize import fsolve
from scipy.stats import mvn
import os
import random
import itertools
import cPickle as pickle
from functools import partial
import sparse_nnls
from pyspark import SparkConf,SparkContext
import sqlite3
import struct
import zlib
import csv
import re
def RegressSpectraOntoLibrary(DIASpectraIterator,Library,tol,maxWindowOffset):
RefSpectraLibrary = Library.value
for DIASpectrum in DIASpectraIterator:
precMZ = float(DIASpectrum[1])
precRT = float(DIASpectrum[2]) #MS2 scan retention time, in minutes
index = DIASpectrum[3]
windowWidth = DIASpectrum[4]
DIASpectrum = np.array(DIASpectrum[0])
LibraryCoeffs = []
if len(DIASpectrum.shape) == 2:
if windowWidth > 0:
CandidateRefSpectraLibrary = [spectrum['Spectrum'] for key,spectrum in RefSpectraLibrary.iteritems() if
abs(float(spectrum['PrecursorMZ']) - precMZ) < windowWidth/2]
MassWindowCandidates = [key for key,spectrum in RefSpectraLibrary.iteritems() if
abs(float(spectrum['PrecursorMZ']) - precMZ) < windowWidth/2]
else:
CandidateRefSpectraLibrary = [spectrum['Spectrum'] for key,spectrum in RefSpectraLibrary.iteritems() if
float(spectrum['PrecursorMZ']) > precMZ - maxWindowOffset/2]
MassWindowCandidates = [key for key,spectrum in RefSpectraLibrary.iteritems() if
float(spectrum['PrecursorMZ']) > precMZ - maxWindowOffset/2]
#MERGING OF POINTS IN ACQUIRED SPECTRUM WITH NEARBY M/Z COORDINATES
MergedDIASpecCoordIndices = np.searchsorted(DIASpectrum[:,0]+tol*DIASpectrum[:,0],DIASpectrum[:,0])
MergedDIASpecCoords = DIASpectrum[np.unique(MergedDIASpecCoordIndices),0]
MergedDIASpecIntensities = [np.mean(DIASpectrum[np.where(MergedDIASpecCoordIndices == i)[0],1]) for i in np.unique(MergedDIASpecCoordIndices)]
DIASpectrum = np.array((MergedDIASpecCoords,MergedDIASpecIntensities)).transpose()
#FILTER LIBRARY SPECTRA BY THE CONDITION THAT SOME NUMBER OF THEIR 10 MOST INTENSE PEAKS BELONG TO THE DIA SPECTRUM
CentroidBreaks = np.concatenate((DIASpectrum[:,0]-tol*DIASpectrum[:,0],DIASpectrum[:,0]+tol*DIASpectrum[:,0]))
CentroidBreaks = np.sort(CentroidBreaks)
LocateReferenceCoordsInDIA = [np.searchsorted(CentroidBreaks,M[:,0]) for M in CandidateRefSpectraLibrary]
TopTenPeaksCoordsInDIA = [np.searchsorted(CentroidBreaks,M[np.argsort(-M[:,1])[0:min(10,M.shape[0])],0]) for M in CandidateRefSpectraLibrary]
ReferencePeaksInDIA = [i for i in range(len(MassWindowCandidates)) if
len([a for a in TopTenPeaksCoordsInDIA[i] if a % 2 == 1]) > 5] #min(3,CandidateRefSpectraLibrary[i].shape[0])]
ProportionOfReferencePeaksInDIA = [len([a for a in TopTenPeaksCoordsInDIA[i] if a % 2 == 1])/CandidateRefSpectraLibrary[i].shape[0]
for i in range(len(MassWindowCandidates))]
RefPeptideCandidatesLocations = [LocateReferenceCoordsInDIA[i] for i in ReferencePeaksInDIA]
RefPeptideCandidateList = [CandidateRefSpectraLibrary[i] for i in ReferencePeaksInDIA]
RefPeptideCandidates = [MassWindowCandidates[i] for i in ReferencePeaksInDIA]
NormalizedRefPeptideCandidateList = [M[:,1]/sum(M[:,1]) for M in RefPeptideCandidateList]
RefSpectraLibrarySparseRowIndices = (np.array([i for v in RefPeptideCandidatesLocations for i in v if i % 2 == 1]) + 1)/2
RefSpectraLibrarySparseRowIndices = RefSpectraLibrarySparseRowIndices - 1 #Respect the 0-indexing
RefSpectraLibrarySparseColumnIndices = np.array([i for j in range(len(RefPeptideCandidates)) for i in [j]*len([k for k in RefPeptideCandidatesLocations[j] if k % 2 == 1])])
RefSpectraLibrarySparseMatrixEntries = np.array([NormalizedRefPeptideCandidateList[k][i] for k in range(len(NormalizedRefPeptideCandidateList)) for i in range(len(NormalizedRefPeptideCandidateList[k]))
if RefPeptideCandidatesLocations[k][i] % 2 == 1])
if (len(RefSpectraLibrarySparseRowIndices) > 0 and len(RefSpectraLibrarySparseColumnIndices) > 0 and len(RefSpectraLibrarySparseMatrixEntries) > 0):
UniqueRowIndices = [i for i in set(RefSpectraLibrarySparseRowIndices)]
UniqueRowIndices.sort()
DIASpectrumIntensities=DIASpectrum[UniqueRowIndices,1] #Project the spectrum to those m/z bins at which at least one column of the coefficient matrix has a nonzero entry
DIASpectrumIntensities=np.append(DIASpectrumIntensities,[0]) #Add a zero to the end of the DIA data vector to penalize
#peaks of library spectra not present in the DIA spectrum
#AUGMENT THE LIBRARY MATRIX WITH TOTAL ION INTENSITIES OF PEAKS OF LIBRARY SPECTRA THAT DON'T CORRESPOND TO PEAKS IN DIA SPECTRUM
ReferencePeaksNotInDIA = np.array([k for v in RefPeptideCandidatesLocations for k in range(len(v)) if v[k] % 2 == 0])
SparseColumnIndicesForPeaksNotInDIA = np.arange(len(RefPeptideCandidates))
NumRowsOfLibraryMatrix = max(UniqueRowIndices)
SparseRowIndicesForPeaksNotInDIA = [NumRowsOfLibraryMatrix+1]*len(SparseColumnIndicesForPeaksNotInDIA)
#Duplicate (i,j) entries are summed together, yielding total ion intensities
SparseMatrixEntriesForPeaksNotInDIA = np.array([np.sum([NormalizedRefPeptideCandidateList[j][k]
for k in range(len(NormalizedRefPeptideCandidateList[j]))
if RefPeptideCandidatesLocations[j][k] % 2 == 0])
for j in range(len(NormalizedRefPeptideCandidateList))])
SparseRowIndices=np.append(RefSpectraLibrarySparseRowIndices,SparseRowIndicesForPeaksNotInDIA)
SparseColumnIndices=np.append(RefSpectraLibrarySparseColumnIndices,SparseColumnIndicesForPeaksNotInDIA)
SparseMatrixEntries=np.append(RefSpectraLibrarySparseMatrixEntries,SparseMatrixEntriesForPeaksNotInDIA)
SparseRowIndices = stats.rankdata(SparseRowIndices,method='dense').astype(int) - 1 #Renumber the row indices according to the projected spectrum,
#respecting the 0-indexing
LibrarySparseMatrix = sparse.coo_matrix((SparseMatrixEntries,(SparseRowIndices,SparseColumnIndices)))
LibraryCoeffs = sparse_nnls.lsqnonneg(LibrarySparseMatrix,DIASpectrumIntensities,{'show_progress': False})
LibraryCoeffs = LibraryCoeffs['x']
NonzeroCoeffs = [c for c in LibraryCoeffs if c != 0]
NonzeroCoeffsAboveThreshold = NonzeroCoeffs
Output = [[0,index,0,0,0,0]]
if len(NonzeroCoeffs) > 0:
RefSpectraIDs = [RefPeptideCandidates[j] for j in range(len(RefPeptideCandidates)) if LibraryCoeffs[j] != 0]
Output = [[NonzeroCoeffsAboveThreshold[i],index,RefSpectraIDs[i][0],RefSpectraIDs[i][1],precMZ,precRT] for i in range(len(NonzeroCoeffsAboveThreshold))]
yield Output
def RegressSpectraOntoLibraryWithDecoys(DIASpectraIterator,Library,tol,maxWindowOffset):
RefSpectraLibrary = Library.value
for DIASpectrum in DIASpectraIterator:
precMZ = float(DIASpectrum[1])
precRT = float(DIASpectrum[2]) #MS2 scan retention time, in minutes
index = DIASpectrum[3]
windowWidth = DIASpectrum[4]
DIASpectrum = np.array(DIASpectrum[0])
LibraryCoeffs = []
if len(DIASpectrum.shape) == 2:
if windowWidth > 0:
CandidateRefSpectraLibrary = [spectrum['Spectrum'] for key,spectrum in RefSpectraLibrary.iteritems() if
abs(float(spectrum['PrecursorMZ']) - precMZ) < windowWidth/2]
MassWindowCandidates = [key for key,spectrum in RefSpectraLibrary.iteritems() if
abs(float(spectrum['PrecursorMZ']) - precMZ) < windowWidth/2]
CandidateDecoyLibrary = [spectrum['Spectrum'] for key,spectrum in RefSpectraLibrary.iteritems() if
windowWidth/2 <= abs(float(spectrum['PrecursorMZ']) - precMZ) <= windowWidth]
MassWindowDecoyCandidates = [("DECOY_"+key[0],key[1]) for key,spectrum in RefSpectraLibrary.iteritems() if
windowWidth/2 <= abs(float(spectrum['PrecursorMZ']) - precMZ) <= windowWidth]
else:
CandidateRefSpectraLibrary = [spectrum['Spectrum'] for key,spectrum in RefSpectraLibrary.iteritems() if
float(spectrum['PrecursorMZ']) > precMZ - maxWindowOffset/2]
MassWindowCandidates = [key for key,spectrum in RefSpectraLibrary.iteritems() if
float(spectrum['PrecursorMZ']) > precMZ - maxWindowOffset/2]
CandidateDecoyLibrary = [spectrum['Spectrum'] for key,spectrum in RefSpectraLibrary.iteritems() if
precMZ - maxWindowOffset <= float(spectrum['PrecursorMZ']) <= precMZ - maxWindowOffset/2]
MassWindowDecoyCandidates = [("DECOY_"+key[0],key[1]) for key,spectrum in RefSpectraLibrary.iteritems() if
precMZ - maxWindowOffset <= float(spectrum['PrecursorMZ']) <= precMZ - maxWindowOffset/2]
#FILTER LIBRARY SPECTRA BY THE CONDITION THAT SOME NUMBER OF THEIR 10 MOST INTENSE PEAKS BELONG TO THE DIA SPECTRUM
CentroidBreaks = np.concatenate((DIASpectrum[:,0]-tol*DIASpectrum[:,0],DIASpectrum[:,0]+tol*DIASpectrum[:,0]))
CentroidBreaks.sort()
LocateReferenceCoordsInDIA = [np.searchsorted(CentroidBreaks,M[:,0]) for M in CandidateRefSpectraLibrary]
#Hard cutoff - at least 5 of the 10 most intense peaks (or all peaks if there are fewer than 3) of reference spectrum must appear in acquired spectrum
TopTenPeaksCoordsInDIA = [np.searchsorted(CentroidBreaks,M[np.argsort(-M[:,1])[0:min(10,M.shape[0])],0]) for M in CandidateRefSpectraLibrary]
ReferencePeaksInDIA = [i for i in range(len(MassWindowCandidates)) if
len([a for a in TopTenPeaksCoordsInDIA[i] if a % 2 == 1]) > 5] #min(3,CandidateRefSpectraLibrary[i].shape[0])]
#SHIFT ALL FRAGMENT ION PEAKS OF ALL DECOY SPECTRA BY 20 M/Z TO ENSURE DISSIMILARITY FROM REAL SPECTRA
LocateDecoyCoordsInDIA = [np.searchsorted(CentroidBreaks,M[:,0]+20) for M in CandidateDecoyLibrary]
TopTenPeaksCoordsInDIA = [np.searchsorted(CentroidBreaks,M[np.argsort(-M[:,1])[0:min(10,M.shape[0])],0]+20) for M in CandidateDecoyLibrary]
DecoyPeaksInDIA = [i for i in range(len(MassWindowDecoyCandidates)) if
len([a for a in TopTenPeaksCoordsInDIA[i] if a % 2 == 1]) > 5] #min(3,CandidateRefSpectraLibrary[i].shape[0])]
RefPeptideCandidatesLocations = [LocateReferenceCoordsInDIA[i] for i in ReferencePeaksInDIA]
RefPeptideCandidateList = [CandidateRefSpectraLibrary[i] for i in ReferencePeaksInDIA]
RefPeptideCandidates = [MassWindowCandidates[i] for i in ReferencePeaksInDIA]
NormalizedRefPeptideCandidateList = [M[:,1]/sum(M[:,1]) for M in RefPeptideCandidateList]
DecoyCandidatesLocations = [LocateDecoyCoordsInDIA[i] for i in DecoyPeaksInDIA]
DecoyCandidateList = [CandidateDecoyLibrary[i] for i in DecoyPeaksInDIA]
DecoyCandidates = [MassWindowDecoyCandidates[i] for i in DecoyPeaksInDIA]
NormalizedDecoyCandidateList = [M[:,1]/sum(M[:,1]) for M in DecoyCandidateList]
RefSpectraLibrarySparseRowIndices = (np.array([i for v in RefPeptideCandidatesLocations for i in v if i % 2 == 1]) + 1)/2
RefSpectraLibrarySparseRowIndices = RefSpectraLibrarySparseRowIndices - 1 #Respect the 0-indexing
RefSpectraLibrarySparseColumnIndices = np.array([i for j in range(len(RefPeptideCandidates)) for i in [j]*len([k for k in RefPeptideCandidatesLocations[j] if k % 2 == 1])])
RefSpectraLibrarySparseMatrixEntries = np.array([NormalizedRefPeptideCandidateList[k][i] for k in range(len(NormalizedRefPeptideCandidateList)) for i in range(len(NormalizedRefPeptideCandidateList[k]))
if RefPeptideCandidatesLocations[k][i] % 2 == 1])
if (len(RefSpectraLibrarySparseRowIndices) > 0 and len(RefSpectraLibrarySparseColumnIndices) > 0 and len(RefSpectraLibrarySparseMatrixEntries) > 0):