# Source code for MSIght.refactor_fragger_process

# -*- coding: utf-8 -*-
"""
Created on Fri Nov 15 17:19:57 2024

@author: lafields2
"""

import pandas as pd
import os

def process_fragger(protein_oi_list, ppm_error, psm_path, sized_he_image, output_path):
    """
    Summarise unique peptides of the requested proteins from a FragPipe PSM report.

    Parameters
    ----------
    protein_oi_list : list of str
        Protein IDs to look up; each is compared for equality against the
        'Protein ID' column of the PSM report.

    ppm_error : float
        PPM error tolerance, copied into every output row and used to derive
        the Da error threshold.

    psm_path : str
        Path to the PSM report, read with ``pandas.read_table``.

    sized_he_image : numpy.ndarray
        Reference H&E image used for MSI data integration (not used in this function).

    output_path : str
        Directory in which the summary CSV is written.

    Returns
    -------
    output_path_report : str
        Path to ``results_for_MSIght_other2Col.csv`` inside ``output_path``.

    Raises
    ------
    KeyError
        If a column this function accesses is missing from the PSM report.

    Notes
    -----
    - Columns read from the report: 'Is Unique', 'Protein ID', 'Peptide',
      'Calculated Peptide Mass' and 'Spectrum'.
    - Only rows whose 'Is Unique' value equals True are considered.
    - For each protein, one output row is produced per distinct
      'Calculated Peptide Mass' (the first PSM with that mass is kept), and
      '# Unique Peptides' is the number of distinct 'Peptide' values.
    - Every protein in ``protein_oi_list`` is processed and the CSV is written
      exactly once, after all of them. If nothing matches (or the list is
      empty) the CSV contains only the header row.
    """
    # NOTE(review): the visible module defines ``process_fragger`` a second
    # time further down; at import time that later definition replaces this one.
    summary_columns = [
        'Protein Name',
        '# Unique Peptides',
        'Peptide',
        'Peptide Theoretical Mass',
        'Peptide Theoretical m/z (+1)',
        'ppm Error Threshold',
        'Calc. Da Error Threshold',
        'LC-MS/MS Scan',
    ]
    # Added to the peptide mass to obtain the value stored in the '(+1)' m/z column.
    # NOTE(review): 1.00784 is not the usual proton mass (~1.00728) -- confirm intended.
    mass_h = 1.00784

    psm_report = pd.read_table(psm_path)
    # The explicit comparison is kept so the row mask is built exactly as before.
    unique_psms = psm_report[psm_report['Is Unique'] == True]

    rows = []
    for protein in protein_oi_list:
        protein_psms = unique_psms[unique_psms['Protein ID'] == protein]
        number_unique_peptides = len(protein_psms.drop_duplicates(subset=['Peptide']))
        per_mass = protein_psms.drop_duplicates(subset=['Calculated Peptide Mass'])
        entries = zip(
            per_mass['Peptide'].to_numpy(),
            per_mass['Calculated Peptide Mass'].to_numpy(),
            per_mass['Spectrum'].to_numpy(),
        )
        for peptide, pep_mass, scan in entries:
            # For positive masses and ppm_error < 1e6 this reduces to
            # (ppm_error / 1e6) * pep_mass. The original two-step rounding is
            # kept unchanged so the reported thresholds do not shift.
            # NOTE(review): the intermediate rounding can move the result by
            # up to ~0.01 Da -- confirm whether that is intended.
            da_err_equiv = round(abs(((ppm_error / 1000000) * pep_mass) - pep_mass), 2)
            da_error = round(pep_mass - da_err_equiv, 2)
            rows.append((
                protein,
                number_unique_peptides,
                peptide,
                pep_mass,
                mass_h + pep_mass,
                ppm_error,
                da_error,
                scan,
            ))

    # Build and write the summary once, after every protein has been processed.
    fragger_results_summary = pd.DataFrame(rows, columns=summary_columns)
    output_path_report = os.path.join(output_path, 'results_for_MSIght_other2Col.csv')
    fragger_results_summary.to_csv(output_path_report, index=False)
    return output_path_report

def process_fragger_gene(gene_oi_list, ppm_error, psm_path, sized_he_image, output_path):
    """
    Summarise unique peptides of the requested genes from a FragPipe PSM report.

    Parameters
    ----------
    gene_oi_list : list of str
        Genes to look up; each is compared for equality against the 'Gene'
        column of the PSM report.

    ppm_error : float
        PPM error tolerance, copied into every output row and used to derive
        the Da error threshold.

    psm_path : str
        Path to the PSM report, read with ``pandas.read_table``.

    sized_he_image : numpy.ndarray
        Reference H&E image used for MSI data integration (not used in this function).

    output_path : str
        Directory in which the summary CSV is written.

    Returns
    -------
    output_path_report : str
        Path to ``results_for_MSIght_other2Col.csv`` inside ``output_path``.

    Raises
    ------
    KeyError
        If a column this function accesses is missing from the PSM report.

    Notes
    -----
    - Columns read from the report: 'Is Unique', 'Gene', 'Peptide',
      'Calculated Peptide Mass' and 'Spectrum'.
    - Only rows whose 'Is Unique' value equals True are considered.
    - For each gene, one output row is produced per distinct
      'Calculated Peptide Mass' (the first PSM with that mass is kept), and
      '# Unique Peptides' is the number of distinct 'Peptide' values.
    - The gene is stored in the output column named 'Protein Name'.
    - Every gene in ``gene_oi_list`` is processed and the CSV is written
      exactly once, after all of them. If nothing matches (or the list is
      empty) the CSV contains only the header row.
    """
    summary_columns = [
        'Protein Name',
        '# Unique Peptides',
        'Peptide',
        'Peptide Theoretical Mass',
        'Peptide Theoretical m/z (+1)',
        'ppm Error Threshold',
        'Calc. Da Error Threshold',
        'LC-MS/MS Scan',
    ]
    # Added to the peptide mass to obtain the value stored in the '(+1)' m/z column.
    # NOTE(review): 1.00784 is not the usual proton mass (~1.00728) -- confirm intended.
    mass_h = 1.00784

    psm_report = pd.read_table(psm_path)
    # The explicit comparison is kept so the row mask is built exactly as before.
    unique_psms = psm_report[psm_report['Is Unique'] == True]

    rows = []
    for gene in gene_oi_list:
        gene_psms = unique_psms[unique_psms['Gene'] == gene]
        number_unique_peptides = len(gene_psms.drop_duplicates(subset=['Peptide']))
        per_mass = gene_psms.drop_duplicates(subset=['Calculated Peptide Mass'])
        entries = zip(
            per_mass['Peptide'].to_numpy(),
            per_mass['Calculated Peptide Mass'].to_numpy(),
            per_mass['Spectrum'].to_numpy(),
        )
        for peptide, pep_mass, scan in entries:
            # For positive masses and ppm_error < 1e6 this reduces to
            # (ppm_error / 1e6) * pep_mass. The original two-step rounding is
            # kept unchanged so the reported thresholds do not shift.
            # NOTE(review): the intermediate rounding can move the result by
            # up to ~0.01 Da -- confirm whether that is intended.
            da_err_equiv = round(abs(((ppm_error / 1000000) * pep_mass) - pep_mass), 2)
            da_error = round(pep_mass - da_err_equiv, 2)
            rows.append((
                gene,
                number_unique_peptides,
                peptide,
                pep_mass,
                mass_h + pep_mass,
                ppm_error,
                da_error,
                scan,
            ))

    # Build and write the summary once, after every gene has been processed.
    fragger_results_summary = pd.DataFrame(rows, columns=summary_columns)
    output_path_report = os.path.join(output_path, 'results_for_MSIght_other2Col.csv')
    fragger_results_summary.to_csv(output_path_report, index=False)
    return output_path_report
def process_fragger(protein_oi_list, ppm_error, psm_path, sized_he_image, output_path):
    """
    Filter a FragPipe PSM report by proteins of interest and write a peptide summary CSV.

    Parameters
    ----------
    protein_oi_list : list of str
        Protein IDs to look up; each is compared for equality against the
        'Protein ID' column of the PSM report.

    ppm_error : float
        PPM error tolerance, copied into every output row and used to derive
        the Da error threshold.

    psm_path : str
        Path to the PSM report, read with ``pandas.read_table``.

    sized_he_image : numpy.ndarray
        Reference H&E image used for MSI data integration (not used in this function).

    output_path : str
        Directory in which the summary CSV is written.

    Returns
    -------
    output_path_report : str
        Path to ``results_for_MSIght_other2Col.csv`` inside ``output_path``.

    Raises
    ------
    KeyError
        If a column this function accesses is missing from the PSM report.

    Notes
    -----
    - Columns read from the report: 'Is Unique', 'Protein ID', 'Peptide',
      'Calculated Peptide Mass' and 'Spectrum'.
    - Only rows whose 'Is Unique' value equals True are considered.
    - For each protein, one output row is produced per distinct
      'Calculated Peptide Mass' (the first PSM with that mass is kept), and
      '# Unique Peptides' is the number of distinct 'Peptide' values.
    - Every protein in ``protein_oi_list`` is processed and the CSV is written
      exactly once, after all of them. If nothing matches (or the list is
      empty) the CSV contains only the header row.
    """
    # NOTE(review): a function with this same name is also defined earlier in
    # the visible module; this later definition is the one left bound after import.
    summary_columns = [
        'Protein Name',
        '# Unique Peptides',
        'Peptide',
        'Peptide Theoretical Mass',
        'Peptide Theoretical m/z (+1)',
        'ppm Error Threshold',
        'Calc. Da Error Threshold',
        'LC-MS/MS Scan',
    ]
    # Added to the peptide mass to obtain the value stored in the '(+1)' m/z column.
    # NOTE(review): 1.00784 is not the usual proton mass (~1.00728) -- confirm intended.
    mass_h = 1.00784

    psm_report = pd.read_table(psm_path)
    # The explicit comparison is kept so the row mask is built exactly as before.
    unique_psms = psm_report[psm_report['Is Unique'] == True]

    rows = []
    for protein in protein_oi_list:
        protein_psms = unique_psms[unique_psms['Protein ID'] == protein]
        number_unique_peptides = len(protein_psms.drop_duplicates(subset=['Peptide']))
        per_mass = protein_psms.drop_duplicates(subset=['Calculated Peptide Mass'])
        entries = zip(
            per_mass['Peptide'].to_numpy(),
            per_mass['Calculated Peptide Mass'].to_numpy(),
            per_mass['Spectrum'].to_numpy(),
        )
        for peptide, pep_mass, scan in entries:
            # For positive masses and ppm_error < 1e6 this reduces to
            # (ppm_error / 1e6) * pep_mass. The original two-step rounding is
            # kept unchanged so the reported thresholds do not shift.
            # NOTE(review): the intermediate rounding can move the result by
            # up to ~0.01 Da -- confirm whether that is intended.
            da_err_equiv = round(abs(((ppm_error / 1000000) * pep_mass) - pep_mass), 2)
            da_error = round(pep_mass - da_err_equiv, 2)
            rows.append((
                protein,
                number_unique_peptides,
                peptide,
                pep_mass,
                mass_h + pep_mass,
                ppm_error,
                da_error,
                scan,
            ))

    # Build and write the summary once, after every protein has been processed.
    fragger_results_summary = pd.DataFrame(rows, columns=summary_columns)
    output_path_report = os.path.join(output_path, 'results_for_MSIght_other2Col.csv')
    fragger_results_summary.to_csv(output_path_report, index=False)
    return output_path_report