Skip to content
Victor Lin edited this page May 21, 2024 · 20 revisions

Lambda setup

The contents of this repo are tailored for usage via AWS Lambda.

  1. Set up the AWS Lambda functions

     bash src/extract_attributes/lambda-init.sh
    
  2. Create the layers (environments) necessary to run the code

     bash src/extract_attributes/create-layer.sh
    
  3. Upload the code to the functions

     bash src/extract_attributes/deploy.sh
    

Aurora

upload steps:

aws rds modify-current-db-cluster-capacity \
--db-cluster-identifier serratus-aurora \
--capacity 32 \
--seconds-before-timeout 10

watch 'aws rds describe-db-clusters --db-cluster-identifier serratus-aurora | grep Capacity'
truncate biosample3
alter table biosample3 add primary key (biosample_id);

SQL

  • biosample2: add testing for exclusions which caught some false positives/negatives
  • biosample3: artem's rules
  • biosample4: disable exclusion rules (show all FPs)
  • biosample5: add attributes column

some interesting queries:

-- maximum potential coverage
select x.count::decimal/y.count
from (
    select count(*) from biosample2
    where not (geo_coord_extracted is null and geo_text_extracted is null)
) x
join (
    select count(*) from biosample2
) y on 1=1

-- # biosamples to be geocoded
select count(*) from biosample2
where geo_coord_extracted is null and geo_text_extracted is not null

-- distinct with counts
select geo_text_extracted, count(*)
from biosample2
where geo_text_extracted is not null
group by geo_text_extracted
order by count(*) desc
limit 10000

-- count distinct - should be # rows of query above
select count(distinct geo_text_extracted) from biosample2


-- json stuff

select count(*), cardinality(keys), keys
from (
	select array(select jsonb_object_keys(geo_text_all)) as keys
	from biosample3
) as subquery
group by keys
order by count(*) desc

-- keys
select *, jsonb_object_keys(geo_coord_all) from biosample2

-- specific key
select * from biosample2
where geo_coord_all->>'lat_lon' is not null

-- keys filtered by count (cardinality)
select geo_text_all
from (
	select geo_text_all, array(select jsonb_object_keys(geo_text_all)) as keys
	from biosample4
) as subquery
where cardinality(keys) > 3
limit 100

-- individual key counts
SELECT
  geo_coord_key, count(*)
FROM (
  SELECT jsonb_object_keys(geo_coord_all) AS geo_coord_key
  FROM biosample2
) AS subquery
GROUP BY
  geo_coord_key
ORDER BY COUNT(*) DESC

curation of manual key exclusion list

  1. get key counts

    SELECT
    geo_coord_key, count(*)
    FROM (
    SELECT jsonb_object_keys(geo_text_all) AS geo_coord_key
    FROM biosample4
    ) AS subquery
    GROUP BY
    geo_coord_key
    ORDER BY COUNT(*) DESC
  2. go through and validate false positives

    select * from biosample4
    where geo_text_all::text like '%tumour%'
    limit 1000

shell stuff

# copy biosample_set.xml.gz
wget -qO- https://ftp.ncbi.nlm.nih.gov/biosample/biosample_set.xml.gz | aws s3 cp - s3://serratus-public/notebook/210212_geo/victorlin/biosample_parse/biosample_set.xml.gz

# unzip to biosample_set.xml
aws s3 cp s3://serratus-public/notebook/210212_geo/victorlin/biosample_parse/biosample_set.xml.gz - | zcat | aws s3 cp - s3://serratus-public/notebook/210212_geo/victorlin/biosample_parse/biosample_set.xml


# preview XML head
zcat < biosample_set.xml.gz | head -n 100000 > head.xml

# find occurrences of latitude_and_longitude in XML
aws s3 cp s3://serratus-public/notebook/210212_geo/victorlin/biosample_parse/biosample_set.xml - | grep --line-buffered latitude_and_longitude

TODO

Clone this wiki locally