import os
import pandas as pd
import numpy as np
import random
import scipy as scp
import warnings
'ignore') warnings.filterwarnings(
Frequently used statistical concepts in Bioinformatics
# # Install packages. Uncomment to install and comment back after installing
# %pip install hmmlearn
# %pip install lifelines
In bioinformatics, statistical concepts are pivotal for analyzing and interpreting complex biological data. Below are ten fundamental statistical concepts, each defined with explanations, mathematical formulations, and Python code examples to illustrate their applications.
1. Probability Theory and Bayesβ Theorem
Definition: Probability theory quantifies the likelihood of events occurring. Bayesβ Theorem provides a way to update the probability of a hypothesis based on new evidence.
Mathematical Formulation: π ( π΄ β£ π΅ ) = π ( π΅ β£ π΄ ) Γ π ( π΄ ) π ( π΅ ) P(Aβ£B)= P(B) P(Bβ£A)ΓP(A)βWhere:
π ( π΄ β£ π΅ ) P(Aβ£B) is the posterior probability of event A given B. π ( π΅ β£ π΄ ) P(Bβ£A) is the likelihood of event B given A. π ( π΄ ) P(A) and π ( π΅ ) P(B) are the prior probabilities of events A and B, respectively.
Example: In genomics, determining the probability of a disease given a genetic marker involves updating prior knowledge with new genetic data
# Calculating posterior probability using Bayes' Theorem
def bayes_theorem(prior_A, likelihood_B_given_A, prior_B):
return (likelihood_B_given_A * prior_A) / prior_B
# Example values
= 0.01 # Prior probability of disease
prior_disease = 0.9 # P(Test positive | Disease)
sensitivity = 0.95 # P(Test negative | No Disease)
specificity = 1 - prior_disease
prior_no_disease = 1 - specificity
false_positive_rate
# P(Test positive)
= (sensitivity * prior_disease) + (false_positive_rate * prior_no_disease)
prior_test_positive
# P(Disease | Test positive)
= bayes_theorem(prior_disease, sensitivity, prior_test_positive)
posterior_disease_given_positive print(f"Posterior probability of disease given a positive test: {posterior_disease_given_positive:.4f}")
Posterior probability of disease given a positive test: 0.1538
2. Hypothesis Testing
Definition: A statistical method to determine if there is enough evidence to reject a null hypothesis in favor of an alternative hypothesis.
Mathematical Formulation:
Null Hypothesis (π»0): Assumes no effect or difference.
Alternative Hypothesis (π»1): Assumes an effect or difference exists.
Test Statistic: A value calculated from sample data used to decide whether to reject π»0.
p-value: The probability of obtaining a test statistic at least as extreme as the one observed, assuming π»0 is true.
Example: Testing whether a new drug affects gene expression levels compared to a control.
import numpy as np
from scipy import stats
# Sample data: gene expression levels
= np.array([5.1, 5.3, 5.5, 5.7, 5.9])
control = np.array([5.8, 6.0, 6.2, 6.4, 6.6])
treatment
# Perform two-sample t-test
= stats.ttest_ind(treatment, control)
t_stat, p_value print(f"T-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
# Interpret the result
= 0.05
alpha if p_value < alpha:
print("Reject the null hypothesis: Significant difference between groups.")
else:
print("Fail to reject the null hypothesis: No significant difference between groups.")
T-statistic: 3.5000, p-value: 0.0081
Reject the null hypothesis: Significant difference between groups.
3. Regression Analysis
Definition: A set of statistical processes for estimating relationships among variables.
Mathematical Formulation:
Linear Regression Model:
π = π½ 0 + π½ 1 π + π Y=Ξ² 0 β +Ξ² 1 β X+Ο΅ π
Y: Dependent variable
X: Independent variable
Ξ² 0 β : Intercept
Ξ² 1 β : Slope
Ο΅: Error term
Example: Predicting protein concentration based on gene expression levels.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Sample data: gene expression (X) and protein concentration (Y)
= np.array([1, 2, 3, 4, 5]).reshape(-1, 1)
X = np.array([1.2, 1.9, 3.1, 3.9, 5.1])
Y
# Create and fit the model
= LinearRegression()
model
model.fit(X, Y)
# Coefficients
= model.intercept_
intercept = model.coef_[0]
slope print(f"Intercept: {intercept:.2f}, Slope: {slope:.2f}")
# Predict and plot
= model.predict(X)
Y_pred ='blue', label='Actual data')
plt.scatter(X, Y, color='red', label='Fitted line')
plt.plot(X, Y_pred, color'Gene Expression')
plt.xlabel('Protein Concentration')
plt.ylabel(
plt.legend() plt.show()
Intercept: 0.10, Slope: 0.98
import pandas as pd
import numpy as np
from scipy import stats
# Sample data: gene expression levels in different tissues
= {
data 'Tissue': ['Liver', 'Liver', 'Liver', 'Heart', 'Heart', 'Heart', 'Brain', 'Brain', 'Brain'],
'Expression': [5.1, 5.3, 5.5, 6.1, 6.3, 6.5, 7.1, 7.3, 7.5]
}= pd.DataFrame(data)
df
# Perform one-way ANOVA
= df[df['Tissue'] == 'Liver']['Expression']
liver = df[df['Tissue'] == 'Heart']['Expression']
heart = df[df['Tissue'] == 'Brain']['Expression']
brain
= stats.f_oneway(liver, heart, brain)
f_stat, p_val print(f"F-statistic: {f_stat:.4f}, p-value: {p_val:.4f}")
# Interpret the result
= 0.05
alpha if p_val < alpha:
print("Reject the null hypothesis: Significant differences exist between tissue groups.")
else:
print("Fail to reject the null hypothesis: No significant differences between tissue groups.")
F-statistic: 75.0000, p-value: 0.0001
Reject the null hypothesis: Significant differences exist between tissue groups.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Sample data: gene expression levels (rows: genes, columns: samples)
= np.array([
data 2.5, 2.4],
[0.5, 0.7],
[2.2, 2.9],
[1.9, 2.2],
[3.1, 3.0],
[2.3, 2.7],
[2.0, 1.6],
[1.0, 1.1],
[1.5, 1.6],
[1.1, 0.9]
[
])
# Standardize the data
= StandardScaler()
scaler = scaler.fit_transform(data)
data_std
# Perform PCA
= PCA(n_components=2)
pca = pca.fit_transform(data_std)
principal_components
# Plot the results
0], principal_components[:, 1], c='blue')
plt.scatter(principal_components[:, 'Principal Component 1')
plt.xlabel('Principal Component 2')
plt.ylabel('PCA of Gene Expression Data')
plt.title( plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Sample data: gene expression levels
= np.array([
data 1.0, 2.0],
[1.5, 1.8],
[5.0, 8.0],
[8.0, 8.0],
[1.0, 0.6],
[9.0, 11.0],
[8.0, 2.0],
[10.0, 2.0],
[9.0, 3.0]
[
])
# Perform K-means clustering
= KMeans(n_clusters=3)
kmeans
kmeans.fit(data)= kmeans.labels_
labels = kmeans.cluster_centers_
centroids
# Plot the results
= ['r', 'g', 'b']
colors for i in range(len(data)):
0], data[i][1], c=colors[labels[i]], s=30)
plt.scatter(data[i][0], centroids[:, 1], marker='x', s=100, c='black')
plt.scatter(centroids[:, 'Gene Expression Feature 1')
plt.xlabel('Gene Expression Feature 2')
plt.ylabel('K-means Clustering of Gene Expression Data')
plt.title( plt.show()
# Define states and transition matrix
= ['A', 'C', 'G', 'T']
states = {
transition_matrix 'A': {'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3},
'C': {'A': 0.1, 'C': 0.4, 'G': 0.4, 'T': 0.1},
'G': {'A': 0.2, 'C': 0.3, 'G': 0.3, 'T': 0.2},
'T': {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25},
}
# Generate a Markov sequence
def generate_markov_sequence(length, start_state='A'):
= [start_state]
sequence = start_state
current_state for _ in range(length - 1):
= random.choices(
next_state =states,
population=[transition_matrix[current_state][s] for s in states]
weights0]
)[
sequence.append(next_state)= next_state
current_state return ''.join(sequence)
# Generate a sequence of length 50 starting with 'A'
= generate_markov_sequence(50, start_state='A')
markov_sequence print("Generated Markov Sequence:", markov_sequence)
Generated Markov Sequence: AAATCGTGTTTAATCGGCGACCGCCGTATCCCCGCCTGACGTTGGGAATG
import logging
# Set logging level to suppress informational messages
logging.getLogger().setLevel(logging.ERROR)
# Your code here
from hmmlearn import hmm
# Define the HMM
= hmm.MultinomialHMM(n_components=2, n_iter=100, tol=0.01)
model
# Encoding Exon and Intron states as 0 and 1
# Assume 'A', 'C', 'G', 'T' as observations (encoded as 0, 1, 2, 3)
= ['Exon', 'Intron']
states = ['A', 'C', 'G', 'T']
observations
# Transition probability matrix for Exon and Intron
# High self-transition probabilities to simulate longer sequences
= np.array([0.5, 0.5]) # Start with equal probability
model.startprob_ = np.array([
model.transmat_ 0.8, 0.2], # Exon to Exon, Exon to Intron
[0.2, 0.8] # Intron to Exon, Intron to Intron
[
])
# Emission probability matrix for Exon and Intron
# Exons may have slightly different nucleotide distribution
= np.array([
model.emissionprob_ 0.25, 0.25, 0.25, 0.25], # Equal for simplicity in exons
[0.1, 0.4, 0.4, 0.1] # Higher C/G content in introns
[
])
# Generate a sample sequence (observations encoded as integers)
= np.array([[0, 1, 2, 3, 2, 1, 0, 2, 3, 0]]).T # Sample DNA sequence as 'A', 'C', 'G', 'T'
sequence
# Fit the model to the sequence and predict the hidden states
= model.fit(sequence)
model = model.predict(sequence)
hidden_states
# Decode and print the hidden states
= [states[state] for state in hidden_states]
decoded_states print("Observed Sequence: ", ''.join([observations[i[0]] for i in sequence]))
print("Predicted Hidden States:", decoded_states)
Observed Sequence: ACGTGCAGTA
Predicted Hidden States: ['Intron', 'Exon', 'Intron', 'Exon', 'Intron', 'Exon', 'Intron', 'Exon', 'Intron', 'Exon']
import numpy as np
from statsmodels.stats.multitest import multipletests
# Sample p-values from multiple tests
= np.array([0.01, 0.04, 0.03, 0.05, 0.20, 0.001, 0.15, 0.005])
p_values
# Apply Benjamini-Hochberg correction for False Discovery Rate (FDR)
= multipletests(p_values, alpha=0.05, method='fdr_bh')
reject, pvals_corrected, _, _
# Results
for i, (p_val, p_corr, rej) in enumerate(zip(p_values, pvals_corrected, reject)):
print(f"Test {i+1}: Original p-value = {p_val:.3f}, Corrected p-value = {p_corr:.3f}, Reject null hypothesis: {rej}")
Test 1: Original p-value = 0.010, Corrected p-value = 0.027, Reject null hypothesis: True
Test 2: Original p-value = 0.040, Corrected p-value = 0.064, Reject null hypothesis: False
Test 3: Original p-value = 0.030, Corrected p-value = 0.060, Reject null hypothesis: False
Test 4: Original p-value = 0.050, Corrected p-value = 0.067, Reject null hypothesis: False
Test 5: Original p-value = 0.200, Corrected p-value = 0.200, Reject null hypothesis: False
Test 6: Original p-value = 0.001, Corrected p-value = 0.008, Reject null hypothesis: True
Test 7: Original p-value = 0.150, Corrected p-value = 0.171, Reject null hypothesis: False
Test 8: Original p-value = 0.005, Corrected p-value = 0.020, Reject null hypothesis: True
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
# Sample data: survival times and event occurrences
= {
data 'Time': [5, 6, 6, 2, 4, 4, 3, 5, 8, 6],
'Event': [1, 0, 1, 1, 0, 1, 0, 1, 1, 0]
}= pd.DataFrame(data)
df
# Initialize the Kaplan-Meier fitter
= KaplanMeierFitter()
kmf
# Fit the data
=df['Time'], event_observed=df['Event'])
kmf.fit(durations
# Plot the survival function
kmf.plot_survival_function()'Survival Function')
plt.title('Time')
plt.xlabel('Survival Probability')
plt.ylabel( plt.show()
import networkx as nx
import matplotlib.pyplot as plt
# Create a sample directed graph representing a biological pathway
= nx.DiGraph()
G
# Add nodes (genes/proteins)
= ['GeneA', 'GeneB', 'GeneC', 'GeneD', 'GeneE']
nodes
G.add_nodes_from(nodes)
# Add edges (interactions)
= [('GeneA', 'GeneB'), ('GeneB', 'GeneC'), ('GeneC', 'GeneD'), ('GeneD', 'GeneE'), ('GeneA', 'GeneC')]
edges
G.add_edges_from(edges)
# Draw the network
= nx.spring_layout(G)
pos =True, node_color='lightblue', edge_color='gray', node_size=2000, font_size=10, font_weight='bold')
nx.draw(G, pos, with_labels'Biological Pathway Network')
plt.title( plt.show()