Bioinformatics
pipeline
Hisat2 Rna Seq

Run RNA seq using Hisat2 / DESeq2

Tis is the pipeline to generate featureCounts/ volcanoplots/ etc..., you might will have to change couple of parameters

Download the SRA Files

use script wich is present in boilerplate of the bioinformatincs blogs, it has multithreading to download the stuff faster.

  • download
  • unzip

To Run Hisat2

GTF_FILE="/workspace/workdisk3/rudhra/database/utils/grch38/Homo_sapiens.GRCh38.113.gtf"
HISAT2_INDEX="/workspace/workdisk3/rudhra/database/utils/grch38/genome" 
THREADS = 22
OUTPUT = "./aligned"
INPUT_DIR = "./data"

For pairend reads

 # Command template
    command = (
        f"hisat2 -p {THREADS} -q --rna-strandness RF "
        f"-x {HISAT2_INDEX} "
        f"-1 {INPUT_DIR}/{id}_1.fastq.gz "
        f"-2 {INPUT_DIR}/{id}_2.fastq.gz "
        f"| samtools view -b "
        f"| samtools sort -o {OUTPUT}/{id}.bam"
    )

    print(command)
    os.system(command)

For singleend reads

 command = (
        f"hisat2 -p {THREADS} -q --rna-strandness F "
        f"-x {HISAT2_INDEX} "
        f"-U {INPUT_DIR}/{i} "
        f"| samtools view -b "
        f"| samtools sort -o {OUTPUT}/{id}.bam"
    )
    
    print(command)
    os.system(command)

Generate featureCounts

use -p for pairend reads

featureCounts -a annotation.gtf -g gene_name -o counts_gene_names.txt *.bam

RStudio

preprocess the countmatrix

  • label it with proper name
  • label it control / treatment
  • merge the matrix
IF FACING ERRORS WITH RUNNING RSUTIO GO TO DOCKER STUDIO BLOGS
# Load required libraries
library(biomaRt)
 
# Step 1: Read count matrices
pairend_reads_data <- read.delim("./feature_counts/pairend.txt", comment.char = "#", header = TRUE, row.names = 1)
singleend_reads_data <- read.delim("./feature_counts/singleend.txt", comment.char = "#", header = TRUE, row.names = 1)
 
# Step 2: Extract count columns
count_column_pattern <- "\\.\\.aligned\\."
pairend_count_columns <- grep(count_column_pattern, colnames(pairend_reads_data), value = TRUE)
singleend_count_columns <- grep(count_column_pattern, colnames(singleend_reads_data), value = TRUE)
 
pairend_read_matrix <- pairend_reads_data[, pairend_count_columns]
singleend_reads_matrix <- singleend_reads_data[, singleend_count_columns]
 
# Step 3: Clean column names
clean_colnames <- function(names) {
  gsub("\\.\\.aligned\\.", "", names)
}
colnames(pairend_read_matrix) <- clean_colnames(colnames(pairend_read_matrix))
colnames(singleend_reads_matrix) <- clean_colnames(colnames(singleend_reads_matrix))
 
# Step 4: Define metadata
sample_metadata <- data.frame(
  Run = c("SRR17454413", "SRR17454414", "SRR17454415",
          "SRR17454416", "SRR17454417", "SRR17454418", "SRR17454419"),
  Condition = c("Control", "Control", "Control", "XLAG", "XLAG", "XLAG", "XLAG"),
  Layout = c("PAIRED", "SINGLE", "SINGLE", "PAIRED", "SINGLE", "PAIRED", "SINGLE"),
  Replicate = c(1, 2, 3, 1, 2, 2, 3),
  stringsAsFactors = FALSE
)
 
# Step 5: Generate readable sample names without layout tag
sample_metadata$SampleName <- paste0(sample_metadata$Condition, "_Rep", sample_metadata$Replicate)
 
# Step 6: Rename columns using metadata
rename_columns <- function(matrix, layout) {
  runs <- sample_metadata$Run[sample_metadata$Layout == layout]
  names <- sample_metadata$SampleName[sample_metadata$Layout == layout]
  new_colnames <- names[match(gsub("\\.bam$", "", colnames(matrix)), runs)]
  colnames(matrix) <- new_colnames
  return(matrix)
}
pairend_read_matrix <- rename_columns(pairend_read_matrix, "PAIRED")
singleend_reads_matrix <- rename_columns(singleend_reads_matrix, "SINGLE")
 
# Step 7: Convert Ensembl IDs to HGNC symbols
# Combine Ensembl IDs
all_ensembl_ids <- unique(c(rownames(pairend_read_matrix), rownames(singleend_reads_matrix)))
 
# Query biomaRt
ensembl <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl")
conversion <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = all_ensembl_ids,
  mart = ensembl
)
 
# Clean conversion table
conversion <- conversion[conversion$hgnc_symbol != "", ]
conversion <- conversion[!duplicated(conversion$ensembl_gene_id), ]
 
# Merge with matrices
pairend_read_matrix$ensembl_gene_id <- rownames(pairend_read_matrix)
merged_pe <- merge(conversion, pairend_read_matrix, by.x = "ensembl_gene_id", by.y = "ensembl_gene_id")
merged_pe$hgnc_symbol <- make.unique(merged_pe$hgnc_symbol)
rownames(merged_pe) <- merged_pe$hgnc_symbol
merged_pe <- merged_pe[, !(colnames(merged_pe) %in% c("ensembl_gene_id", "hgnc_symbol"))]
 
singleend_reads_matrix$ensembl_gene_id <- rownames(singleend_reads_matrix)
merged_se <- merge(conversion, singleend_reads_matrix, by.x = "ensembl_gene_id", by.y = "ensembl_gene_id")
merged_se$hgnc_symbol <- make.unique(merged_se$hgnc_symbol)
rownames(merged_se) <- merged_se$hgnc_symbol
merged_se <- merged_se[, !(colnames(merged_se) %in% c("ensembl_gene_id", "hgnc_symbol"))]
 
# Step 8: Combine matrices
combined_matrix <- cbind(merged_pe, merged_se)
 
# Step 9: Optional - Save output
# write.csv(combined_matrix, "combined_counts_matrix.csv")
# write.csv(sample_metadata, "sample_metadata.csv", row.names = FALSE)
 
# Step 10: Preview
head(combined_matrix)

For Downstream analysis using DESeq

colnames(combined_matrix)
# Check for duplicates
colnames(combined_matrix) <- make.unique(colnames(combined_matrix))
 
sample_info <- data.frame(
  sample = colnames(combined_matrix),
  condition = c("XLAG", "XLAG", "XLAG", "Control", "Control", "XLAG")
)
rownames(sample_info) <- sample_info$sample
 
 
library(DESeq2)
 
dds <- DESeqDataSetFromMatrix(
  countData = combined_matrix,
  colData = sample_info,
  design = ~ condition
)
 
 
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
 
 
dds <- DESeq(dds)
res <- results(dds, contrast = c("condition", "XLAG", "Control"))
 
plotMA(res, ylim=c(-5,5))
 
library(EnhancedVolcano)
EnhancedVolcano(res,
                lab = rownames(res),
                x = 'log2FoldChange',
                y = 'pvalue')
 
# Filter for significant genes (adjust thresholds as necessary)
res_sig <- res[which(res$padj < 0.05 & abs(res$log2FoldChange) > 1), ]
# Extract gene names for GO analysis
deg_genes <- rownames(res_sig)
 
library(clusterProfiler)
 
# Load required package
library(org.Hs.eg.db)
 
# Convert gene symbols to ENTREZ IDs
deg_entrez <- bitr(deg_genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
 
# Load required package
library(org.Hs.eg.db)
 
# Convert gene symbols to ENTREZ IDs
deg_entrez <- bitr(deg_genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)