diff --git a/src/modules/Neighborhood.py b/src/modules/Neighborhood.py new file mode 100644 index 0000000..7e25636 --- /dev/null +++ b/src/modules/Neighborhood.py @@ -0,0 +1,64 @@ +import shiny #Only needed if you are planning to use it later +import scanpy as sc +import umap + +adata = sc.read("C:/Users/pc/.ipython/HW3/Hw3covid_Data_AllCells.h5ad") + +#computing +def compute(adata, n_neighbors=10, n_pcs=40): + """_summary_ + + Args: + Hw3covid_Data_AllCells (_type_): _description_ + n_neighbors (int, optional): _description_. Defaults to 10. + n_pcs (int, optional): _description_. Defaults to 40. + """ + try: + sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs) + print("Neighbors computed successfully.") + except Exception as e: + print(f"Error in computing neighbors: {e}") + +#embedding +def embed(adata, color=['CST3', 'NKG7', 'PPBP']): + """_summary_ + + Args: + adata (_type_): _description_ + color (list, optional): _description_. Defaults to ['CST3', 'NKG7', 'PPBP']. + """ + try: + sc.pl.umap(adata, color=color) + sc.pl.umap(adata, color=color) # Note: This plots twice; check if you really want this + print("Embedding and plotting successful.") + except Exception as e: + print(f"Error in embedding: {e}") + +#clustering +def cluster(adata, color=['leiden', 'CST3', 'NKG7']): + """_summary_ + + Args: + adata (_type_): _description_ + color (list, optional): _description_. Defaults to ['leiden', 'CST3', 'NKG7']. + """ + try: + sc.tl.leiden(adata) + sc.pl.umap(adata, color=color) + print("Clustering and plotting successful.") + except Exception as e: + print(f"Error in clustering: {e}") + +#saving +def save(results_file, adata): + """_summary_ + + Args: + results_file (_type_): _description_ + adata (_type_): _description_ + """ + try: + adata.write(results_file) + print(f"Data saved successfully to {results_file}.") + except Exception as e: + print(f"Error in saving data: {e}") diff --git a/src/modules/Neighborhood_demo.py b/src/modules/Neighborhood_demo.py new file mode 100644 index 0000000..28ff284 --- /dev/null +++ b/src/modules/Neighborhood_demo.py @@ -0,0 +1,30 @@ +import scanpy as sc +import Neighborhood as nb # Your functions are here! +import umap + +#Load the dataset +print("Loading dataset...") +adata = sc.datasets.pbmc3k() +print(f"Dataset loaded! Number of cells: {adata.n_obs}, Number of genes: {adata.n_vars}\n") + +#Compute the neighbor graph +print("Computing the neighborhood graph...") +nb.compute(adata, n_neighbors=10, n_pcs=40) + +#Compute UMAP +print("Computing UMAP embedding...") +sc.tl.umap(adata) + +#Visualize the embedding +print("Plotting UMAP...") +genes_of_interest = ['CD3D', 'MS4A1', 'GNLY'] +nb.embed(adata, color=genes_of_interest) + +#Perform clustering +print("Performing Leiden clustering...") +nb.cluster(adata, color=['leiden'] + genes_of_interest) + +#Save the results +results_file = "pbmc3k_final_results.h5ad" +print(f"Saving results to {results_file}...") +nb.save(results_file, adata) diff --git a/src/modules/visualization.py b/src/modules/visualization.py index e69de29..5c49e97 100644 --- a/src/modules/visualization.py +++ b/src/modules/visualization.py @@ -0,0 +1,91 @@ +# Import libraries +import scanpy as sc +import matplotlib.pyplot as plt +import tempfile + +# PCA plot function +def pca_plot(adata, log=True, color='CST3'): + """_summary_ + + Args: + adata (_type_): _description_æ + log (bool, optional): _description_. Defaults to True. + color (str, optional): _description_. Defaults to 'CST3'. + + Raises: + ValueError: _description_ + """ + if adata is None: + raise ValueError("Input AnnData object (adata) is None.") + sc.pl.pca_variance_ratio(adata, log=log) + sc.pl.pca(adata, color=color) + +# Highly variable genes plot +def highvarGen_pl(adata, min_mean=0.0125, max_mean=3, min_disp=0.5): + """_summary_ + + Args: + adata (_type_): _description_ + min_mean (float, optional): _description_. Defaults to 0.0125. + max_mean (int, optional): _description_. Defaults to 3. + min_disp (float, optional): _description_. Defaults to 0.5. + + Raises: + ValueError: _description_ + ValueError: _description_ + """ + if adata is None: + raise ValueError("Input AnnData object (adata) is None.") + if "Cell type" not in adata.obs: + raise ValueError("'Cell type' not found in adata.obs. Please check your annotations.") + + sc.pl.umap(adata, color="Cell type", show=False) + sc.pl.highly_variable_genes(adata) + +# Violin plot function +def violin_pl(adata): + """_summary_ + + Args: + adata (_type_): _description_ + + Raises: + ValueError: _description_ + ValueError: _description_ + """ + if adata is None: + raise ValueError("Input AnnData object (adata) is None.") + if "Cell type" not in adata.obs: + raise ValueError("'Cell type' not found in adata.obs. Please check your annotations.") + + sc.pl.umap(adata, color="Cell type", show=False) + sc.pl.violin( + adata, + ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], + jitter=0.4, + multi_panel=True + ) + +# Scatter plot function +def scatter_plotting(adata, x='total_counts', y='pct_counts_mt'): + """_summary_ + + Args: + adata (_type_): _description_ + x (str, optional): _description_. Defaults to 'total_counts'. + y (str, optional): _description_. Defaults to 'pct_counts_mt'. + + Raises: + ValueError: _description_ + ValueError: _description_ + """ + if adata is None: + raise ValueError("Input AnnData object (adata) is None.") + if "Cell type" not in adata.obs: + raise ValueError("'Cell type' not found in adata.obs. Please check your annotations.") + + sc.pl.umap(adata, color="Cell type", show=False) + sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt') + sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts') + + diff --git a/src/modules/visualization_demo.py b/src/modules/visualization_demo.py new file mode 100644 index 0000000..c4e9229 --- /dev/null +++ b/src/modules/visualization_demo.py @@ -0,0 +1,132 @@ +import scanpy as sc +import matplotlib.pyplot as plt +import matplotlib +import os + +# === Plot helper === +def save_and_show(filename, folder='plots') -> None: + """ + Saves the current matplotlib plot to a file and displays it. + + Parameters: + filename (str): Name of the output image file + folder (str): Folder where the image will be saved (default is "plots"). + """ + if not os.path.exists(folder): + os.makedirs(folder) + plt.savefig(f"{folder}/{filename}") + plt.show() + +def load_file_demo(file_path: str) -> sc.AnnData: + """ + Loads a .h5ad file, applies basic quality control preprocessing, + and plots the top expressed genes. + + Steps: + - Reads file and saves a copy + - Plots highest expressed genes + - Filters cells and genes + - Annotates mitochondrial genes + - Calculates QC metrics + - Normalizes and log-transforms data + - Identifies highly variable genes + + Parameters: + file_path (str): Path to the .h5ad file + + Returns: + AnnData: Preprocessed AnnData object + """ + adata = sc.read_h5ad(file_path) + adata.write("adata_demo.h5ad") + print("Data saved as adata_demo.h5ad") + + print("Plotting highest expressed genes...") + sc.pl.highest_expr_genes(adata, n_top=20, show=False) + save_and_show("highest_expr_genes.png") + + print("Filtering cells and genes...") + sc.pp.filter_cells(adata, min_genes=200) + sc.pp.filter_genes(adata, min_cells=3) + + print("Annotating mitochondrial genes...") + adata.var['mt'] = adata.var_names.str.startswith('MT-') + + print("Calculating QC metrics...") + sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) + + print("Normalizing and log-transforming...") + sc.pp.normalize_total(adata, target_sum=1e4) + sc.pp.log1p(adata) + + print("Finding highly variable genes...") + sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) + + print("Preprocessing done.") + return adata + +def pca_plot(adata: sc.AnnData, log: bool = True, color: str = 'CST3') -> None: + """ + Performs PCA on the dataset and visualizes results. + + Parameters: + adata (AnnData): Preprocessed AnnData object + log (bool): Whether to log scale the variance ratio plot + color (str): Gene or metadata field to color PCA by + """ + sc.tl.pca(adata, svd_solver='arpack') + + sc.pl.pca_variance_ratio(adata, log=log, show=False) + save_and_show("pca_variance_ratio.png") + + sc.pl.pca(adata, color=color, show=False) + save_and_show("pca_plot.png") + +def highvarGen_pl(adata: sc.AnnData) -> None: + """ + Plots various visualizations related to highly variable genes and quality metrics. + + - UMAP by cell type (if available) + - Highly variable genes plot + - Violin plots for quality control stats + """ + if "Cell type" in adata.obs: + sc.pl.umap(adata, color="Cell type", show=False) + save_and_show("umap_cell_type.png") + else: + print("Error: 'Cell type' variable couldn't find in adata.obs.") + + sc.pl.highly_variable_genes(adata, show=False) + save_and_show("highly_variable_genes.png") + + sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'], jitter=0.4, multi_panel=True, show=False) + save_and_show("violin_qc.png") + +def scatter_plotting(adata: sc.AnnData) -> None: + """ + Generates scatter plots for common QC visualizations: + + - UMAP by cell type (if available) + - Total counts vs. mitochondrial percentage + - Total counts vs. number of genes + """ + if "Cell type" in adata.obs: + sc.pl.umap(adata, color="Cell type", show=False) + save_and_show("umap_cell_type_scatter.png") + else: + print("Error: 'Cell type' variable couldn't find in adata.obs.") + + sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', show=False) + save_and_show("scatter_total_vs_pct_mt.png") + + sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', show=False) + save_and_show("scatter_total_vs_n_genes.png") + +# === MAIN RUN === +file_path = r"C:\Users\gediz\Downloads\Hw3covid_Data_AllCells.h5ad" + +adata = load_file_demo(file_path) + +pca_plot(adata) +highvarGen_pl(adata) +scatter_plotting(adata) \ No newline at end of file