diff --git a/README.md b/README.md index 8d52345..0a34979 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,70 @@ -# Environment Installation +## Visualize the integrated resource relationships as a network diagram + +The following steps will create the output necessary to visualize the relationships among integrated resources and primary sources as a network diagram. In order to finish the figure, Cytoscape must be installed. Manual instructions to create the figure are included. + +### Create network where the size of primary sources and aggregated DBs represents the number of integrated resources that use them as mappings + +In Cytoscape: +1. Import network from file: Resource Interaction Table.xlsx, Sheet 1 (set as source, interaction, target) +2. Import table from file: Resource Interaction Table.xlsx, Sheet 2 (set as node, category) +3. Set node style, fill color, discrete mapping to unique colors for each category +4. Position integrated DB nodes in the following order: mdad, gutmgene, gutmdisorder, disbiome, amadis, gimica, bugsigdb, dbbact, mikg4md, preprobiotickg, kg-microbe, biochem4j, unifuncnet +5. Remove labels of edges +6. Change label size to circle, inDegree, continuous mapping + a. Go to tools, analyze network, analyze as directed graph to change node size + b. Toggle with Continuous Mapping Editor for node size to make peak ~10 up to ~70-80 + c. Select integrated databases, set bypass for shape (rectangle) and size (15) +7. Only include edges between integrated DBs, aggregated DBs, and primary sources + a. Select all integrated db nodes, select - edges - select all edges, then select - nodes - deselect all nodes to remove edges + b. Select all aggregated db nodes, select - edges - select all edges, then select - nodes - deselect all nodes to remove edges +8. Save figure as Network_sizeByDegree.svg + +### Create network where primary sources and aggregated DBs are collapsed into categories + +1. 
Run the following: +``` +cd ./scripts/ + +python db_expansion.py +Rscript integrated_db_plotting.R +Rscript collapse_categories.R +``` + +In Cytoscape: +2. Import network from file: ./data/category_edges.tsv (set as source, interaction, target) +3. Import table from file: Resource Interaction Table.xlsx, Sheet 2 (set as node, category) +4. Align integrated DBs in order above categories + a. Select integrated DBs only, Layout Tools, align and distribute +5. Select all categories, set size to 100 +6. Change line width to 1, ensure no arrowhead is there (arrowhead will be added in Adobe Illustrator) +7. Save as Network_categories.svg + +### Create network with edges +In Adobe Illustrator: +1. Open Network_sizeByDegree.svg +2. Open Network_categories.svg + a. Update colors to chosen palette + b. For large category circles, make 50% opacity + c. Rotate rectangle, text for integrated DB rectangles +3. Change arrowhead to shape to edit colors + a. Add target arrowhead + b. Select same, fill & stroke + c. Object, path, outline stroke + +## Visualize the Reference Matrix + +The following code will create Figure 2b, the matrix of integrated resource relationships. + +### Environment Installation ```bash mamba env create -f db_review.yml ``` -# Generate the Reference Matrix Visualization +### Generate the Reference Matrix Visualization ```bash snakemake --cores 1 ``` -## Child Database Expansion +#### Child Database Expansion The `db_expansion.py` script generates the edge distance between a given database `i` and all child databases that it references. An example case for WikiPathways is given below. ```mermaid @@ -37,8 +94,6 @@ erDiagram | WikiPathways | PubChem | 2 | | WikiPathways | GenBank | 2 | -## Reference Matrix Visualization +### Reference Matrix Visualization We then use our expanded reference table to hierarchically cluster the Source Databases (plotted along the y-axis) based off edge distance to the child nodes. 
![alt text](./plots/db_edge_matrix_children.png "Database Links with Children") - - diff --git a/data/Resource Interaction Table.xlsx b/data/Resource Interaction Table.xlsx index 6d1448e..9ceda9f 100644 Binary files a/data/Resource Interaction Table.xlsx and b/data/Resource Interaction Table.xlsx differ diff --git a/scripts/collapse_categories.R b/scripts/collapse_categories.R index 70167a0..3a65716 100644 --- a/scripts/collapse_categories.R +++ b/scripts/collapse_categories.R @@ -1,8 +1,13 @@ -myedges <- read_xlsx('./data/Resource Interaction Table.xlsx') -mynodes <- read_xlsx('./data/Resource Interaction Table.xlsx', sheet = 2) +library(readxl) +library(tidyverse) + + +setwd('./') +myedges <- read_xlsx('../data/Resource Interaction Table.xlsx') +mynodes <- read_xlsx('../data/Resource Interaction Table.xlsx', sheet = 2) mynodes %>% - filter(category == 'Integrated DB') %>% + filter(category == 'Aggregate DB') %>% pull(node) -> idbs myedges %>% @@ -13,4 +18,4 @@ myedges %>% rename(target = category) %>% arrange(desc(source), desc(target)) -> category_edges -write_tsv(category_edges, './data/category_edges.tsv') +write_tsv(category_edges, '../data/category_edges.tsv') diff --git a/scripts/db_expansion.py b/scripts/db_expansion.py index 5c0494c..02b2c25 100644 --- a/scripts/db_expansion.py +++ b/scripts/db_expansion.py @@ -1,8 +1,7 @@ import networkx as nx import pandas as pd -myedges = pd.read_excel('./data/Resource Interaction Table.xlsx') -myedges = myedges[myedges['predicate'] != 'has construction method'] +myedges = pd.read_excel('../data/Resource Interaction Table.xlsx') G = nx.from_pandas_edgelist(myedges, create_using=nx.DiGraph()) @@ -15,7 +14,6 @@ for node in G.nodes(): if node not in leaves: tmp_desc = set(nx.descendants(G, node)) - #tmp_desc = leaves.intersection(tmp_desc) tmp_desc = list(tmp_desc) desc_dist = [nx.shortest_path_length(G, source=node, target=desc) for desc in tmp_desc] @@ -25,5 +23,5 @@ outdf.append(tmp_df) outdf = pd.concat(outdf) 
-outdf.to_csv('./data/expanded_edge_list.csv', +outdf.to_csv('../data/expanded_edge_list.csv', index=False) diff --git a/scripts/integrated_db_graph.R b/scripts/integrated_db_graph.R index cfef781..6e2ecbf 100644 --- a/scripts/integrated_db_graph.R +++ b/scripts/integrated_db_graph.R @@ -7,10 +7,6 @@ library(readxl) mydat <- read_xlsx('./data/Resource Interaction Table.xlsx') mynodes <- read_xlsx('./data/Resource Interaction Table.xlsx', sheet = 2) -mydat %>% - filter(predicate != 'has construction method') %>% - select(-predicate) -> mydat - mydat %>% group_by(target) %>% summarise(inDegree = n()) %>% @@ -19,7 +15,7 @@ mydat %>% mynodes %>% merge(inDegree, all = T) %>% mutate(inDegree = if_else(is.na(inDegree), 0, inDegree), - ISDB = as.character(category == 'Integrated DB')) -> mynodes + ISDB = as.character(category == 'Aggregate DB')) -> mynodes G <- graph_from_data_frame(mydat, directed = T, mynodes) diff --git a/scripts/integrated_db_plotting.R b/scripts/integrated_db_plotting.R index 91f20fa..4d8a701 100644 --- a/scripts/integrated_db_plotting.R +++ b/scripts/integrated_db_plotting.R @@ -3,64 +3,66 @@ library(readxl) library(stringr) library(tidyverse) -full_edges <- read_csv("./data/expanded_edge_list.csv") -orig_edges <- read_xlsx("./data/Resource Interaction Table.xlsx", sheet = 1) -orig_edges %>% - filter(predicate != "has construction method") -> orig_edges +setwd('./') +full_edges <- read_csv('../data/expanded_edge_list.csv') +orig_edges <- read_xlsx('../data/Resource Interaction Table.xlsx', sheet = 1) fill_deg <- max(full_edges$distance) + 1 -nodes <- read_xlsx("./data/Resource Interaction Table.xlsx", sheet = 2) -#nodes %>% -# mutate(category = str_replace(category, "\\/| ", "\n")) -> nodes +nodes <- read_xlsx('../data/Resource Interaction Table.xlsx', sheet = 2) -nodes %>% - filter(category == "Aggregated\nDB") %>% +nodes %>% + filter(category == 'General\nAggregate\nDB') %>% pull(node) -> reffed_idbs -full_edges %>% - pull(distance) %>% - unique() 
%>% +full_edges %>% + pull(distance) %>% + unique() %>% factor() -> distance_factor levels(distance_factor) <- rev(levels(distance_factor)) -category_sorted <- c("Microbe", "Protein", - "Metabolites", "Disease", - "Aggregated DB") +category_sorted <- c('Microbe', 'Protein', + 'Metabolites','Pathway','Disease', + 'General Aggregate DB') -full_edges %>% - select(source, target, distance) %>% +full_edges %>% + select(source, target, distance) %>% mutate(distance = as.numeric(distance), - distance = abs(distance - fill_deg)) %>% + distance = abs(distance - fill_deg)) %>% spread(target, distance, fill = 0) -> edge_mat -source_order <- hclust(dist(edge_mat[, -1]))$order -source_sorted <- edge_mat[source_order, 1]$source +source_order <- hclust(dist(edge_mat[,-1]))$order +source_sorted <- edge_mat[source_order,1]$source -full_edges %>% - merge(nodes, by.x = "target", by.y = "node") %>% +full_edges %>% + merge(nodes, by.x = 'target', by.y = 'node') %>% mutate(distance = factor(distance, levels = levels(distance_factor)), category = factor(category, levels = category_sorted), reffed_idbs = source %in% reffed_idbs, - source_f = factor(source, - levels = source_sorted)) -> plot_dat + source_f = factor(source, + levels = source_sorted)) -> plot_dat + +# New facet label names for category variable +category_labels <- c('Microbe'='Microbe', 'Protein'='Protein', + 'Metabolites'='Metabolite','Pathway'='PW','Disease'='Disease', + 'General Aggregate DB'='General Aggregate DB') plot_dat %>% - ggplot(aes(x = target, y = source_f, + ggplot(aes(x = target, y = source_f, fill = distance)) + - geom_tile(color = "black") + - facet_grid(~category, scales = "free", space = "free") + - theme_bw(base_size = 11) + + geom_tile(color = 'black') + + facet_grid(~category, scales = 'free', space = 'free',labeller = labeller(category = category_labels)) + + theme_bw(base_size = 11) + theme(axis.text.x = element_text(angle = 270 + 45, hjust = 0, vjust = 0.5)) + scale_fill_brewer(guide = 
guide_legend(reverse = TRUE)) + - labs(x = "Target DB", - y = "Source DB", - fill = "Reference Degree", - title = "Integrated Databases Links") -> db_viz_final + labs(x = 'Integrated Resource', + y = 'Primary Source', + fill = 'Reference Degree', + title = 'Primary Source Mappings of all Integrated Resources') -> db_viz_final -ggsave("./plots/db_edge_matrix_children.png", +ggsave('../db_viz_final.png', plot = db_viz_final, - width = 8, - height = 4) + width = 12, + height = 5)