import matplotlib.pyplot as plt
from matplotlib.transforms import Bbox
from pyseqrna import arg_parser
from pyseqrna import pyseqrna_utils as pu
from pyseqrna import quality_check as qc
from pyseqrna import quality_trimming as qt
from pyseqrna import aligners as al
from pyseqrna import pyseqrna_stats as ps
from pyseqrna import quantification as quant
from pyseqrna import differential_expression as de
from pyseqrna import pyseqrna_plots as pp
from pyseqrna import ribosomal as ribo
from pyseqrna import clustering as cl
from pyseqrna import multimapped_groups as mmg
from pyseqrna.gene_ontology import GeneOntology
from pyseqrna.pathway import Pathway
from pyseqrna.normalize_counts import Normalization
from pyseqrna import report
import math
import pyseqrna.version
import pandas as pd
import numpy as np
import time
import os, sys
from waiting import wait
log = pu.PyseqrnaLogger(mode="w", log="analysis")
log.info("Analysis started")
slurm = True
memory = 100
threads = 30
paired= False
fold = 2
fdr = 0.05
species='hvulgare'
speciestype = 'plants'
outdir = pu.make_directory("pyseqrna_paper_results")
reference_genome="./Hordeum_vulgare.MorexV3_pseudomolecules_assembly.dna.toplevel.fa"
feature_file = "./Hordeum_vulgare.MorexV3_pseudomolecules_assembly.52.gff3"
combination = ["GA-GE", "GB-GF", "GC-GG", "GD-GH", "GI-GM", "GJ-GN", "GK-GO", "GL-GP", "GQ-GU", "GR-GV", "GS-GW", "GT-GX",
"OA-OE", "OB-OF", "OC-OG", "OD-OH", "OI-OM", "OJ-ON", "OK-OO", "OL-OP", "OQ-OU", "OR-OV", "OS-OW", "OT-OX"]
input_data = pu.read_input_file(infile="targets_pyseqrna.txt", inpath="./input/")
samples = input_data['samples']
targets = input_data['targets']
combination = input_data['combinations']
qualitydir = pu.make_directory(os.path.join(outdir, "1_Quality"))
jobid, fastqcout = qc.fastqcRun(sampleDict=samples,outDir=qualitydir, slurm=slurm, mem=memory, cpu=threads, pairedEND=paired)
for job in jobid:
wait(lambda: pu.check_status(job), waiting_for="quality to finish")
log.info(f"Quality check completed for job {job}")
log.info("Read quality check completed succesfully")
outtrim, jobidt = qt.trim_galoreRun(sampleDict=samples,slurm=slurm,mem=memory,cpu=threads, outDir=qualitydir,paired=paired)
for job in jobidt:
wait(lambda: pu.check_status(job), waiting_for="trimming to finish")
log.info(f"Trimming completed for job {job}")
log.info("Read trimming completed succesfully")
log.info("Starting Alignment process")
aligndir = pu.make_directory(os.path.join(outdir, "2_Alignment"))
aligner= al.STAR_Aligner(genome=reference_genome, slurm=slurm, outDir=aligndir)
log.info("Genome indexing started")
jobida = aligner.build_index(mem=memory,cpu=threads)
wait(lambda: pu.check_status(jobida), waiting_for="genome indexing to finish")
log.info("Genome index completed succesfully")
if aligner.check_index():
log.info("Genome index is valid")
outalign, jobalign = aligner.run_Alignment(outtrim, pairedEND=paired, mem=memory, cpu=threads)
for job in jobalign:
wait(lambda: pu.check_status(job), waiting_for="alignment to finish")
log.info(f"Alignment completed for job {job}")
log.info("Read alignment completed succesfully")
align_stat = ps.align_stats(samples,outtrim, outalign,pairedEND=paired)
align_stat.to_excel(aligndir+"/alignment_statistics.xlsx", index=False)
log.info(f"alignment stats completed and written to {aligndir}/alignment_statistics.xlsx")
log.info("Feature Count from aligned reads started")
quantdir = pu.make_directory(os.path.join(outdir, "3_Quantification"))
fjob = quant.featureCount(gff=feature_file, bamDict=outalign, outDir=quantdir)
log.info("feature counts completed and written in %s/Raw_Counts.xlsx",quantdir)
log.info("Counting multimapped read groups")
mmg_count = mmg.countMMG(sampleDict=samples,bamDict=outalign,gff=feature_file, minCount=100, percentSample=0.5)
mmg_count.to_excel(os.path.join('pyseqrna_paper_results/3_Quantification',"Raw_MMGcounts.xlsx" ),index=False)
log.info("counting of multimapped read group finished")
diffdir = pu.make_directory(os.path.join(outdir, "4_Differential_Expression"))
count=pd.read_excel(os.path.join(quantdir,"Raw_Counts.xlsx"))
result = de.runDESeq2(countDF=count,targetFile=targets,design='sample', combination=combination, subset=False)
log.info("Differential expression analysis completed")
ge = de.Gene_Description(species=species, type=speciestype, degFile=result, filtered=False)
results = ge.add_names()
results.to_excel(os.path.join(diffdir,"All_gene_expression.xlsx"), index=False)
log.info(f"Filtering differential expressed genes based on logFC {fold} and FDR {fdr}")
filtered_DEG= de.degFilter(degDF=results, CompareList=combination,FDR=fdr, FOLD=fold, extraColumns=True)
log.info("filtering DEGs completed ")
log.info("writting filter DEGs combination wise to excel sheets")
# write up and down filtered genes together
wa = pd.ExcelWriter(os.path.join(diffdir,"Filtered_DEGs.xlsx"))
for key, value in filtered_DEG['filtered'].items():
value.to_excel(wa,sheet_name=key, index=False)
wa.save()
wa.close()
filtered_DEG['plot'].savefig(os.path.join(diffdir,"Filtered_DEG.png"),dpi=300, bbox_inches='tight')
plotdir = pu.make_directory(os.path.join(outdir, "5_Visualization"))
log.info("Creating heatmap of top 50 DEGs")
heatmap, ax = pp.plotHeatmap(result,combination,num=50, type='degs')
heatmap.savefig(os.path.join(plotdir,f"Heatmap_top50.png"), bbox_inches='tight')
genesdir = pu.getGenes(os.path.join(diffdir,"Filtered_DEGs.xlsx"),combinations=combination, outDir=diffdir)
annodir = pu.make_directory(os.path.join(outdir, "6_Functional_Annotation"))
outgo = pu.make_directory(os.path.join(annodir,"Gene_Ontology"))
gofiles = pu.make_directory(os.path.join(outgo,"GO_Files"))
goplots = pu.make_directory(os.path.join(outgo,"GO_Plots"))
go = GeneOntology(species=species, type=speciestype, keyType='ensembl')
for c in combination:
file_deg = f"{genesdir}/{c}.txt"
ontology_results = go.enrichGO(file=file_deg)
if ontology_results != "No Gene Ontology":
go_results = ge.add_names_annotation(ontology_results['result'])
go_results.to_excel(f"{gofiles}/{c}_gene_ontology.xlsx", index=False)
ontology_results['plot'].savefig(f"{goplots}/{c}_go_dotplot.png", bbox_inches='tight')
plt.close()
else:
log.info(f"No ontology found in {c}")
annodir = os.path.join(outdir, "6_Functional_Annotation")
outkegg = pu.make_directory(os.path.join(annodir,"KEGG_Pathway"))
keggfiles = pu.make_directory(os.path.join(outkegg,"KEGG_Files"))
keggplots = pu.make_directory(os.path.join(outkegg,"KEGG_Plots"))
pt = Pathway(species=species, keyType='ensembl')
ge = de.Gene_Description(species=species, type=speciestype, filtered=False)
for c in combination:
file_deg = f"{genesdir}/{c}.txt"
kegg_results = pt.enrichKEGG(file_deg)
if kegg_results != "No Pathway":
k_result = ge.add_names_annotation(kegg_results['result'])
k_result.to_excel(os.path.join(keggfiles, f"{c}_kegg.xlsx"), index=False)
kegg_results['plot'].savefig( f"{keggplots}/{c}_kegg_dotplot.png", bbox_inches='tight')
plt.close()
else:
log.info(f"No pathway found in {c}")