Source code for MSIght.refactor_fragger_process

# -*- coding: utf-8 -*-
"""
Created on Fri Nov 15 17:19:57 2024

@author: lafields2
"""

import pandas as pd
import os

def process_fragger(protein_oi_list,ppm_error,psm_path,sized_he_image,output_path):
    """
    Processes FragPipe PSM reports to extract peptides and calculate error thresholds for MSI data integration.

    Parameters
    ----------
    protein_oi_list : list of str
        List of proteins of interest to filter from the PSM report.

    ppm_error : float
        PPM error tolerance for mass accuracy filtering.

    psm_path : str
        Path to the PSM report file (.tsv or .txt format).

    sized_he_image : numpy.ndarray
        Reference H&E image used for MSI data integration (not directly used in this function).

    output_path : str
        Directory where the processed results will be saved.

    Returns
    -------
    output_path_report : str
        Path to the saved CSV file containing the processed FragPipe results.

    Notes
    -----
    - Filters unique peptides from the PSM report.
    - Calculates theoretical mass-to-charge ratios (m/z) and PPM-based error thresholds.
    - Saves the processed data as a CSV file for MSIght integration.
    - If no matching proteins are found, the CSV will be empty.
    """
    psm_report = pd.read_table(psm_path)
    filtered_psm_report = psm_report[psm_report['Is Unique'] == True]
    mass_h = 1.00784
    prot_storage = []
    num_unique_pep_storage = []
    pep_storage = []
    pep_mass_storage = []
    pep_mz_z1_storage = []
    da_err_threshold_storage = []
    ppm_err_threshold_storage = []
    scan_storage = []
    for a in protein_oi_list:
        filtered_protein_psm_report = filtered_psm_report[filtered_psm_report['Protein ID'] == a]
        number_unique_peptides = len(filtered_protein_psm_report.drop_duplicates(subset=['Peptide']))
        calc_mass_filter = filtered_protein_psm_report.drop_duplicates(subset=['Calculated Peptide Mass'])
        for line in range(0,len(calc_mass_filter)):
            peptide = calc_mass_filter['Peptide'].iloc[line]
            pep_mass = calc_mass_filter['Calculated Peptide Mass'].iloc[line]
            scan = calc_mass_filter['Spectrum'].iloc[line]
            da_err_equiv = abs(((ppm_error / 1000000) * pep_mass) - pep_mass)
            da_err_equiv = round(da_err_equiv,2)
            da_error = pep_mass - da_err_equiv
            da_error = round(da_error,2)
            prot_storage.append(a)
            num_unique_pep_storage.append(number_unique_peptides)
            pep_storage.append(peptide)
            pep_mass_storage.append(pep_mass)
            pep_mz_z1_storage.append(mass_h + pep_mass)
            da_err_threshold_storage.append(da_error)
            ppm_err_threshold_storage.append(ppm_error)
            scan_storage.append(scan)
        fragger_results_summary = pd.DataFrame()
        fragger_results_summary['Protein Name'] = prot_storage
        fragger_results_summary['# Unique Peptides'] = num_unique_pep_storage
        fragger_results_summary['Peptide'] = pep_storage
        fragger_results_summary['Peptide Theoretical Mass'] = pep_mass_storage
        fragger_results_summary['Peptide Theoretical m/z (+1)'] = pep_mz_z1_storage
        fragger_results_summary['ppm Error Threshold'] = ppm_err_threshold_storage
        fragger_results_summary['Calc. Da Error Threshold'] = da_err_threshold_storage
        fragger_results_summary['LC-MS/MS Scan'] = scan_storage
        #output_path_report = output_path + '\\results_for_MSIght_other2Col.csv'
        output_path_report = os.path.join(output_path,'results_for_MSIght_other2Col.csv')
        fragger_results_summary.to_csv(output_path_report, index=False)
        return output_path_report

[docs]def process_fragger_gene(gene_oi_list,ppm_error,psm_path,sized_he_image,output_path):
    """
    Processes FragPipe PSM reports based on a list of genes of interest and calculates error thresholds 
    for mass spectrometry integration.

    Parameters
    ----------
    gene_oi_list : list of str
        List of genes of interest to filter from the PSM report.

    ppm_error : float
        PPM error tolerance for mass accuracy filtering.

    psm_path : str
        Path to the PSM report file (.tsv or .txt format).

    sized_he_image : numpy.ndarray
        Reference H&E image used for MSI data integration (not directly used in this function).

    output_path : str
        Directory where the processed results will be saved.

    Returns
    -------
    output_path_report : str
        Path to the saved CSV file containing the processed FragPipe results.

    Notes
    -----
    - Filters unique peptides from the PSM report based on the 'Gene' column.
    - Calculates theoretical mass-to-charge ratios (m/z) and PPM-based error thresholds.
    - Saves the processed data as a CSV file for MSIght integration.
    - If no matching genes are found, the CSV will be empty.
    """
    psm_report = pd.read_table(psm_path)
    filtered_psm_report = psm_report[psm_report['Is Unique'] == True]
    mass_h = 1.00784
    prot_storage = []
    num_unique_pep_storage = []
    pep_storage = []
    pep_mass_storage = []
    pep_mz_z1_storage = []
    da_err_threshold_storage = []
    ppm_err_threshold_storage = []
    scan_storage = []
    for a in gene_oi_list:
        filtered_protein_psm_report = filtered_psm_report[filtered_psm_report['Gene'] == a]
        number_unique_peptides = len(filtered_protein_psm_report.drop_duplicates(subset=['Peptide']))
        calc_mass_filter = filtered_protein_psm_report.drop_duplicates(subset=['Calculated Peptide Mass'])
        for line in range(0,len(calc_mass_filter)):
            peptide = calc_mass_filter['Peptide'].iloc[line]
            pep_mass = calc_mass_filter['Calculated Peptide Mass'].iloc[line]
            scan = calc_mass_filter['Spectrum'].iloc[line]
            da_err_equiv = abs(((ppm_error / 1000000) * pep_mass) - pep_mass)
            da_err_equiv = round(da_err_equiv,2)
            da_error = pep_mass - da_err_equiv
            da_error = round(da_error,2)
            prot_storage.append(a)
            num_unique_pep_storage.append(number_unique_peptides)
            pep_storage.append(peptide)
            pep_mass_storage.append(pep_mass)
            pep_mz_z1_storage.append(mass_h + pep_mass)
            da_err_threshold_storage.append(da_error)
            ppm_err_threshold_storage.append(ppm_error)
            scan_storage.append(scan)
        fragger_results_summary = pd.DataFrame()
        fragger_results_summary['Protein Name'] = prot_storage
        fragger_results_summary['# Unique Peptides'] = num_unique_pep_storage
        fragger_results_summary['Peptide'] = pep_storage
        fragger_results_summary['Peptide Theoretical Mass'] = pep_mass_storage
        fragger_results_summary['Peptide Theoretical m/z (+1)'] = pep_mz_z1_storage
        fragger_results_summary['ppm Error Threshold'] = ppm_err_threshold_storage
        fragger_results_summary['Calc. Da Error Threshold'] = da_err_threshold_storage
        fragger_results_summary['LC-MS/MS Scan'] = scan_storage
        #output_path_report = output_path + '\\results_for_MSIght_other2Col.csv'
        output_path_report = os.path.join(output_path,'results_for_MSIght_other2Col.csv')
        fragger_results_summary.to_csv(output_path_report, index=False)
        return output_path_report
    
[docs]def process_fragger(protein_oi_list,ppm_error,psm_path,sized_he_image,output_path):
    """
    Processes FragPipe PSM reports by filtering peptides based on proteins of interest 
    and calculating mass error thresholds for mass spectrometry integration.

    Parameters
    ----------
    protein_oi_list : list of str
        List of protein IDs of interest to filter from the PSM report.

    ppm_error : float
        PPM error tolerance for mass accuracy filtering.

    psm_path : str
        Path to the PSM report file (.tsv or .txt format).

    sized_he_image : numpy.ndarray
        Reference H&E image used for MSI data integration (not directly used in this function).

    output_path : str
        Directory where the processed results will be saved.

    Returns
    -------
    output_path_report : str
        Path to the saved CSV file containing the processed FragPipe results.

    Notes
    -----
    - Filters unique peptides from the PSM report based on the 'Protein ID' column.
    - Calculates theoretical mass-to-charge ratios (m/z) and PPM-based error thresholds.
    - Saves the processed data as a CSV file for MSIght integration.
    - If no matching proteins are found, the CSV will be empty.
    """
    psm_report = pd.read_table(psm_path)
    filtered_psm_report = psm_report[psm_report['Is Unique'] == True]
    mass_h = 1.00784
    prot_storage = []
    num_unique_pep_storage = []
    pep_storage = []
    pep_mass_storage = []
    pep_mz_z1_storage = []
    da_err_threshold_storage = []
    ppm_err_threshold_storage = []
    scan_storage = []
    for a in protein_oi_list:
        filtered_protein_psm_report = filtered_psm_report[filtered_psm_report['Protein ID'] == a]
        number_unique_peptides = len(filtered_protein_psm_report.drop_duplicates(subset=['Peptide']))
        calc_mass_filter = filtered_protein_psm_report.drop_duplicates(subset=['Calculated Peptide Mass'])
        for line in range(0,len(calc_mass_filter)):
            peptide = calc_mass_filter['Peptide'].iloc[line]
            pep_mass = calc_mass_filter['Calculated Peptide Mass'].iloc[line]
            scan = calc_mass_filter['Spectrum'].iloc[line]
            da_err_equiv = abs(((ppm_error / 1000000) * pep_mass) - pep_mass)
            da_err_equiv = round(da_err_equiv,2)
            da_error = pep_mass - da_err_equiv
            da_error = round(da_error,2)
            prot_storage.append(a)
            num_unique_pep_storage.append(number_unique_peptides)
            pep_storage.append(peptide)
            pep_mass_storage.append(pep_mass)
            pep_mz_z1_storage.append(mass_h + pep_mass)
            da_err_threshold_storage.append(da_error)
            ppm_err_threshold_storage.append(ppm_error)
            scan_storage.append(scan)
        fragger_results_summary = pd.DataFrame()
        fragger_results_summary['Protein Name'] = prot_storage
        fragger_results_summary['# Unique Peptides'] = num_unique_pep_storage
        fragger_results_summary['Peptide'] = pep_storage
        fragger_results_summary['Peptide Theoretical Mass'] = pep_mass_storage
        fragger_results_summary['Peptide Theoretical m/z (+1)'] = pep_mz_z1_storage
        fragger_results_summary['ppm Error Threshold'] = ppm_err_threshold_storage
        fragger_results_summary['Calc. Da Error Threshold'] = da_err_threshold_storage
        fragger_results_summary['LC-MS/MS Scan'] = scan_storage
        #output_path_report = output_path + '\\results_for_MSIght_other2Col.csv'
        output_path_report = os.path.join(output_path,'results_for_MSIght_other2Col.csv')
        fragger_results_summary.to_csv(output_path_report, index=False)
        return output_path_report
    
[docs]def global_proteomics_search(fragger_results_path,threshold,min_prot_instances,ppm_error,output_path):
    """
    Performs a global proteomics search on FragPipe results by filtering peptides based on mass differences 
    and minimum protein occurrences.

    Parameters
    ----------
    fragger_results_path : str
        Path to the FragPipe results file (.tsv or .txt format).

    threshold : float
        Mass difference threshold for filtering peptides with similar calculated masses.

    min_prot_instances : int
        Minimum number of instances a protein must appear in the PSM report for inclusion.

    ppm_error : float
        PPM error tolerance for mass accuracy filtering.

    output_path : str
        Directory where the processed results will be saved.

    Returns
    -------
    output_path_report : str
        Path to the saved CSV file containing the processed global proteomics search results.

    Notes
    -----
    - Filters peptides based on unique status and mass differences.
    - Excludes proteins with fewer than `min_prot_instances`.
    - Calculates theoretical mass-to-charge ratios (m/z) and PPM-based error thresholds.
    - Saves the filtered results as a CSV file.
    """
    psm_report = pd.read_table(fragger_results_path)
    filtered_psm_report = psm_report[psm_report['Is Unique'] == True]
    mass_h = 1.00784
    prot_storage = []
    num_unique_pep_storage = []
    pep_storage = []
    pep_mass_storage = []
    pep_mz_z1_storage = []
    da_err_threshold_storage = []
    ppm_err_threshold_storage = []
    scan_storage = []
    def filter_mass(filtered_psm_report):
        sorted_df = filtered_psm_report.sort_values(by='Calculated Peptide Mass')
        filtered_df = sorted_df[~sorted_df['Calculated Peptide Mass'].diff().between(-threshold, threshold) & 
                                ~sorted_df['Calculated Peptide Mass'].diff(-1).between(-threshold, threshold)]
        return filtered_df
    mass_dif_filtered_results = filter_mass(filtered_psm_report)
    filtered_df = mass_dif_filtered_results[mass_dif_filtered_results.groupby('Protein ID')['Protein ID'].transform('count') >= min_prot_instances]
    filtered_protein_list_df = filtered_df.drop_duplicates(subset='Protein ID')
    protein_oi_list = filtered_protein_list_df['Protein ID'].values.tolist()
    for a in protein_oi_list:
        filtered_protein_psm_report = filtered_psm_report[filtered_psm_report['Protein ID'] == a]
        number_unique_peptides = len(filtered_protein_psm_report.drop_duplicates(subset=['Peptide']))
        calc_mass_filter = filtered_protein_psm_report.drop_duplicates(subset=['Calculated Peptide Mass'])
        for line in range(0,len(calc_mass_filter)):
            peptide = calc_mass_filter['Peptide'].iloc[line]
            pep_mass = calc_mass_filter['Calculated Peptide Mass'].iloc[line]
            scan = calc_mass_filter['Spectrum'].iloc[line]
            da_err_equiv = abs(((ppm_error / 1000000) * pep_mass) - pep_mass)
            da_err_equiv = round(da_err_equiv,2)
            da_error = pep_mass - da_err_equiv
            da_error = round(da_error,2)
            prot_storage.append(a)
            num_unique_pep_storage.append(number_unique_peptides)
            pep_storage.append(peptide)
            pep_mass_storage.append(pep_mass)
            pep_mz_z1_storage.append(mass_h + pep_mass)
            da_err_threshold_storage.append(da_error)
            ppm_err_threshold_storage.append(ppm_error)
            scan_storage.append(scan)
    fragger_results_summary = pd.DataFrame()
    fragger_results_summary['Protein Name'] = prot_storage
    fragger_results_summary['# Unique Peptides'] = num_unique_pep_storage
    fragger_results_summary['Peptide'] = pep_storage
    fragger_results_summary['Peptide Theoretical Mass'] = pep_mass_storage
    fragger_results_summary['Peptide Theoretical m/z (+1)'] = pep_mz_z1_storage
    fragger_results_summary['ppm Error Threshold'] = ppm_err_threshold_storage
    fragger_results_summary['Calc. Da Error Threshold'] = da_err_threshold_storage
    fragger_results_summary['LC-MS/MS Scan'] = scan_storage
    #output_path_report = output_path + '\\results_for_MSIght_untarget.csv'
    output_path_report = os.path.join(output_path,'results_for_MSIght_other2Col.csv')
    fragger_results_summary.to_csv(output_path_report, index=False)
    return output_path_report