Bioinformatics
biolerplate-notes
Rna Seq

To download the sra

import os
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
 
# list = ["SRR1234567", "SRR1234568", "SRR1234569", "SRR1234570"]  # Replace with your list
 
successful = []
failed = []
 
def download_fastq(accession):
    try:
        result = subprocess.run(
            ["fastq-dump", "--split-3", "--gzip", accession],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        return accession, True, ""
    except subprocess.CalledProcessError as e:
        return accession, False, e.stderr
    except Exception as e:
        return accession, False, str(e)
 
with ThreadPoolExecutor(max_workers=6) as executor:
    future_to_accession = {executor.submit(download_fastq, acc): acc for acc in list}
 
    for idx, future in enumerate(as_completed(future_to_accession), 1):
        acc = future_to_accession[future]
        try:
            accession, success, message = future.result()
            print(f"[{idx}/{len(list)}] {'✓' if success else '✗'} {accession}")
            if success:
                successful.append(accession)
            else:
                print(f"  Error: {message}")
                failed.append(accession)
        except Exception as e:
            print(f"[{idx}/{len(list)}] ✗ Error in future for {acc}: {str(e)}")
            failed.append(acc)
 
# Summary
print("\n=== Download Summary ===")
print(f"Total: {len(list)}")
print(f"Successful: {len(successful)}")
print(f"Failed: {len(failed)}")
 
if failed:
    print("\nFailed accessions:")
    for acc in failed:
        print(f"- {acc}")

To run hisat alignment

# %%
import os

pairend_reads = []
singend_reads = []
file_list = []
for i in os.listdir("./data"):
    file_list.append(i)
    
for i in file_list:
    if len(i.split("_1")) > 1:
        pairend_reads.append(i)

for i in file_list:
    if len(i.split("_")) <= 1:
        singend_reads.append(i)

print("All the list of pairend reads")
print(pairend_reads)

print("\n\n All the list of single end reads")
print(singend_reads)

# %% [markdown]
# ```
# output:
# 
# All the list of pairend reads
# ['SRR17454416_1.fastq.gz', 'SRR17454418_1.fastq.gz', 'SRR17454413_1.fastq.gz']
# 
# 
#  All the list of single end reads
# ['SRR17454417.fastq.gz', 'SRR17454415.fastq.gz', 'SRR17454414.fastq.gz', 'SRR17454419.fastq.gz']
# ```

# %%
GTF_FILE="/workspace/workdisk3/rudhra/database/utils/grch38/Homo_sapiens.GRCh38.113.gtf"
HISAT2_INDEX="/workspace/workdisk3/rudhra/database/utils/grch38/genome" 
THREADS = 22
OUTPUT = "./aligned"
INPUT_DIR = "./data"

# %% [markdown]
# # To Align Pair end read

# %%
for i in pairend_reads:
    id = i.split("_")[0]
    
    
    # Ensure output directory exists
    os.makedirs(OUTPUT, exist_ok=True)

    # Command template
    command = (
        f"hisat2 -p {THREADS} -q --rna-strandness RF "
        f"-x {HISAT2_INDEX} "
        f"-1 {INPUT_DIR}/{id}_1.fastq.gz "
        f"-2 {INPUT_DIR}/{id}_2.fastq.gz "
        f"| samtools view -b "
        f"| samtools sort -o {OUTPUT}/{id}.bam"
    )

    print(command)
    os.system(command)
    

# %% [markdown]
# # To align single end read

# %%
for i in singend_reads:
    print(i)
    
    # Construct the command
    command = (
        f"hisat2 -p {THREADS} -q --rna-strandness F "
        f"-x {HISAT2_INDEX} "
        f"-U {INPUT_DIR}/{i} "
        f"| samtools view -b "
        f"| samtools sort -o {OUTPUT}/{id}.bam"
    )
    
    print(command)
    os.system(command)

To add gene name instad of ensembleid

featureCounts -a annotation.gtf -g gene_name -o counts.txt sample.bam

Use gene symbols directly during counting

If you haven't run featureCounts yet:

Run it with -g gene_name (assuming your GTF has that field):

featureCounts -a annotation.gtf -g gene_name -o counts_gene_names.txt sample.bam
  • -g gene_name tells featureCounts to use the gene_name attribute instead of the default gene_id.

This only works if your GTF file includes gene_name in the attributes column.

You can download GTFs with both fields from sources like:

  • Ensembl: Homo_sapiens.GRCh38.110.gtf.gz

  • GENCODE: gencode.v38.annotation.gtf.gz (recommended — includes both Ensembl IDs and gene names)