To download the sra
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
# list = ["SRR1234567", "SRR1234568", "SRR1234569", "SRR1234570"] # Replace with your list
successful = []
failed = []
def download_fastq(accession):
try:
result = subprocess.run(
["fastq-dump", "--split-3", "--gzip", accession],
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
return accession, True, ""
except subprocess.CalledProcessError as e:
return accession, False, e.stderr
except Exception as e:
return accession, False, str(e)
with ThreadPoolExecutor(max_workers=6) as executor:
future_to_accession = {executor.submit(download_fastq, acc): acc for acc in list}
for idx, future in enumerate(as_completed(future_to_accession), 1):
acc = future_to_accession[future]
try:
accession, success, message = future.result()
print(f"[{idx}/{len(list)}] {'✓' if success else '✗'} {accession}")
if success:
successful.append(accession)
else:
print(f" Error: {message}")
failed.append(accession)
except Exception as e:
print(f"[{idx}/{len(list)}] ✗ Error in future for {acc}: {str(e)}")
failed.append(acc)
# Summary
print("\n=== Download Summary ===")
print(f"Total: {len(list)}")
print(f"Successful: {len(successful)}")
print(f"Failed: {len(failed)}")
if failed:
print("\nFailed accessions:")
for acc in failed:
print(f"- {acc}")
To run hisat alignment
# %%
import os
pairend_reads = []
singend_reads = []
file_list = []
for i in os.listdir("./data"):
file_list.append(i)
for i in file_list:
if len(i.split("_1")) > 1:
pairend_reads.append(i)
for i in file_list:
if len(i.split("_")) <= 1:
singend_reads.append(i)
print("All the list of pairend reads")
print(pairend_reads)
print("\n\n All the list of single end reads")
print(singend_reads)
# %% [markdown]
# ```
# output:
#
# All the list of pairend reads
# ['SRR17454416_1.fastq.gz', 'SRR17454418_1.fastq.gz', 'SRR17454413_1.fastq.gz']
#
#
# All the list of single end reads
# ['SRR17454417.fastq.gz', 'SRR17454415.fastq.gz', 'SRR17454414.fastq.gz', 'SRR17454419.fastq.gz']
# ```
# %%
GTF_FILE="/workspace/workdisk3/rudhra/database/utils/grch38/Homo_sapiens.GRCh38.113.gtf"
HISAT2_INDEX="/workspace/workdisk3/rudhra/database/utils/grch38/genome"
THREADS = 22
OUTPUT = "./aligned"
INPUT_DIR = "./data"
# %% [markdown]
# # To Align Pair end read
# %%
for i in pairend_reads:
id = i.split("_")[0]
# Ensure output directory exists
os.makedirs(OUTPUT, exist_ok=True)
# Command template
command = (
f"hisat2 -p {THREADS} -q --rna-strandness RF "
f"-x {HISAT2_INDEX} "
f"-1 {INPUT_DIR}/{id}_1.fastq.gz "
f"-2 {INPUT_DIR}/{id}_2.fastq.gz "
f"| samtools view -b "
f"| samtools sort -o {OUTPUT}/{id}.bam"
)
print(command)
os.system(command)
# %% [markdown]
# # To align single end read
# %%
for i in singend_reads:
print(i)
# Construct the command
command = (
f"hisat2 -p {THREADS} -q --rna-strandness F "
f"-x {HISAT2_INDEX} "
f"-U {INPUT_DIR}/{i} "
f"| samtools view -b "
f"| samtools sort -o {OUTPUT}/{id}.bam"
)
print(command)
os.system(command)
To add gene name instad of ensembleid
featureCounts -a annotation.gtf -g gene_name -o counts.txt sample.bam
Use gene symbols directly during counting
If you haven't run featureCounts yet:
Run it with -g gene_name (assuming your GTF has that field):
featureCounts -a annotation.gtf -g gene_name -o counts_gene_names.txt sample.bam
- -g gene_name tells featureCounts to use the gene_name attribute instead of the default gene_id.
This only works if your GTF file includes gene_name in the attributes column.
You can download GTFs with both fields from sources like:
-
Ensembl: Homo_sapiens.GRCh38.110.gtf.gz
-
GENCODE: gencode.v38.annotation.gtf.gz (recommended — includes both Ensembl IDs and gene names)