Run RNA seq using Hisat2 / DESeq2
Tis is the pipeline to generate featureCounts/ volcanoplots/ etc..., you might will have to change couple of parameters
Download the SRA Files
use script wich is present in boilerplate of the bioinformatincs blogs, it has multithreading to download the stuff faster.
- download
- unzip
To Run Hisat2
GTF_FILE="/workspace/workdisk3/rudhra/database/utils/grch38/Homo_sapiens.GRCh38.113.gtf"
HISAT2_INDEX="/workspace/workdisk3/rudhra/database/utils/grch38/genome"
THREADS = 22
OUTPUT = "./aligned"
INPUT_DIR = "./data"
For pairend reads
# Command template
command = (
f"hisat2 -p {THREADS} -q --rna-strandness RF "
f"-x {HISAT2_INDEX} "
f"-1 {INPUT_DIR}/{id}_1.fastq.gz "
f"-2 {INPUT_DIR}/{id}_2.fastq.gz "
f"| samtools view -b "
f"| samtools sort -o {OUTPUT}/{id}.bam"
)
print(command)
os.system(command)
For singleend reads
command = (
f"hisat2 -p {THREADS} -q --rna-strandness F "
f"-x {HISAT2_INDEX} "
f"-U {INPUT_DIR}/{i} "
f"| samtools view -b "
f"| samtools sort -o {OUTPUT}/{id}.bam"
)
print(command)
os.system(command)
Generate featureCounts
use -p
for pairend reads
featureCounts -a annotation.gtf -g gene_name -o counts_gene_names.txt *.bam
RStudio
preprocess the countmatrix
- label it with proper name
- label it control / treatment
- merge the matrix
IF FACING ERRORS WITH RUNNING RSUTIO GO TO DOCKER STUDIO BLOGS
# Load required libraries
library(biomaRt)
# Step 1: Read count matrices
pairend_reads_data <- read.delim("./feature_counts/pairend.txt", comment.char = "#", header = TRUE, row.names = 1)
singleend_reads_data <- read.delim("./feature_counts/singleend.txt", comment.char = "#", header = TRUE, row.names = 1)
# Step 2: Extract count columns
count_column_pattern <- "\\.\\.aligned\\."
pairend_count_columns <- grep(count_column_pattern, colnames(pairend_reads_data), value = TRUE)
singleend_count_columns <- grep(count_column_pattern, colnames(singleend_reads_data), value = TRUE)
pairend_read_matrix <- pairend_reads_data[, pairend_count_columns]
singleend_reads_matrix <- singleend_reads_data[, singleend_count_columns]
# Step 3: Clean column names
clean_colnames <- function(names) {
gsub("\\.\\.aligned\\.", "", names)
}
colnames(pairend_read_matrix) <- clean_colnames(colnames(pairend_read_matrix))
colnames(singleend_reads_matrix) <- clean_colnames(colnames(singleend_reads_matrix))
# Step 4: Define metadata
sample_metadata <- data.frame(
Run = c("SRR17454413", "SRR17454414", "SRR17454415",
"SRR17454416", "SRR17454417", "SRR17454418", "SRR17454419"),
Condition = c("Control", "Control", "Control", "XLAG", "XLAG", "XLAG", "XLAG"),
Layout = c("PAIRED", "SINGLE", "SINGLE", "PAIRED", "SINGLE", "PAIRED", "SINGLE"),
Replicate = c(1, 2, 3, 1, 2, 2, 3),
stringsAsFactors = FALSE
)
# Step 5: Generate readable sample names without layout tag
sample_metadata$SampleName <- paste0(sample_metadata$Condition, "_Rep", sample_metadata$Replicate)
# Step 6: Rename columns using metadata
rename_columns <- function(matrix, layout) {
runs <- sample_metadata$Run[sample_metadata$Layout == layout]
names <- sample_metadata$SampleName[sample_metadata$Layout == layout]
new_colnames <- names[match(gsub("\\.bam$", "", colnames(matrix)), runs)]
colnames(matrix) <- new_colnames
return(matrix)
}
pairend_read_matrix <- rename_columns(pairend_read_matrix, "PAIRED")
singleend_reads_matrix <- rename_columns(singleend_reads_matrix, "SINGLE")
# Step 7: Convert Ensembl IDs to HGNC symbols
# Combine Ensembl IDs
all_ensembl_ids <- unique(c(rownames(pairend_read_matrix), rownames(singleend_reads_matrix)))
# Query biomaRt
ensembl <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl")
conversion <- getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = all_ensembl_ids,
mart = ensembl
)
# Clean conversion table
conversion <- conversion[conversion$hgnc_symbol != "", ]
conversion <- conversion[!duplicated(conversion$ensembl_gene_id), ]
# Merge with matrices
pairend_read_matrix$ensembl_gene_id <- rownames(pairend_read_matrix)
merged_pe <- merge(conversion, pairend_read_matrix, by.x = "ensembl_gene_id", by.y = "ensembl_gene_id")
merged_pe$hgnc_symbol <- make.unique(merged_pe$hgnc_symbol)
rownames(merged_pe) <- merged_pe$hgnc_symbol
merged_pe <- merged_pe[, !(colnames(merged_pe) %in% c("ensembl_gene_id", "hgnc_symbol"))]
singleend_reads_matrix$ensembl_gene_id <- rownames(singleend_reads_matrix)
merged_se <- merge(conversion, singleend_reads_matrix, by.x = "ensembl_gene_id", by.y = "ensembl_gene_id")
merged_se$hgnc_symbol <- make.unique(merged_se$hgnc_symbol)
rownames(merged_se) <- merged_se$hgnc_symbol
merged_se <- merged_se[, !(colnames(merged_se) %in% c("ensembl_gene_id", "hgnc_symbol"))]
# Step 8: Combine matrices
combined_matrix <- cbind(merged_pe, merged_se)
# Step 9: Optional - Save output
# write.csv(combined_matrix, "combined_counts_matrix.csv")
# write.csv(sample_metadata, "sample_metadata.csv", row.names = FALSE)
# Step 10: Preview
head(combined_matrix)
For Downstream analysis using DESeq
colnames(combined_matrix)
# Check for duplicates
colnames(combined_matrix) <- make.unique(colnames(combined_matrix))
sample_info <- data.frame(
sample = colnames(combined_matrix),
condition = c("XLAG", "XLAG", "XLAG", "Control", "Control", "XLAG")
)
rownames(sample_info) <- sample_info$sample
library(DESeq2)
dds <- DESeqDataSetFromMatrix(
countData = combined_matrix,
colData = sample_info,
design = ~ condition
)
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)
res <- results(dds, contrast = c("condition", "XLAG", "Control"))
plotMA(res, ylim=c(-5,5))
library(EnhancedVolcano)
EnhancedVolcano(res,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue')
# Filter for significant genes (adjust thresholds as necessary)
res_sig <- res[which(res$padj < 0.05 & abs(res$log2FoldChange) > 1), ]
# Extract gene names for GO analysis
deg_genes <- rownames(res_sig)
library(clusterProfiler)
# Load required package
library(org.Hs.eg.db)
# Convert gene symbols to ENTREZ IDs
deg_entrez <- bitr(deg_genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
# Load required package
library(org.Hs.eg.db)
# Convert gene symbols to ENTREZ IDs
deg_entrez <- bitr(deg_genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)