-
Notifications
You must be signed in to change notification settings - Fork 0
notes
Victor Lin edited this page May 21, 2024
·
20 revisions
The contents of this repo are tailored for usage via AWS Lambda.
-
Set up the AWS Lambda functions
bash src/extract_attributes/lambda-init.sh -
Create the layers (environments) necessary to run the code
bash src/extract_attributes/create-layer.sh -
Upload the code to the functions
bash src/extract_attributes/deploy.sh
upload steps:
aws rds modify-current-db-cluster-capacity \
--db-cluster-identifier serratus-aurora \
--capacity 32 \
--seconds-before-timeout 10
watch 'aws rds describe-db-clusters --db-cluster-identifier serratus-aurora | grep Capacity'
truncate biosample3
alter table biosample3 add primary key (biosample_id);- biosample2: add testing for exclusions which caught some false positives/negatives
- biosample3: artem's rules
- biosample4: disable exclusion rules (show all FPs)
- biosample5: add
attributescolumn
some interesting queries:
-- maximum potential coverage
select x.count::decimal/y.count
from (
select count(*) from biosample2
where not (geo_coord_extracted is null and geo_text_extracted is null)
) x
join (
select count(*) from biosample2
) y on 1=1
-- # biosamples to be geocoded
select count(*) from biosample2
where geo_coord_extracted is null and geo_text_extracted is not null
-- distinct with counts
select geo_text_extracted, count(*)
from biosample2
where geo_text_extracted is not null
group by geo_text_extracted
order by count(*) desc
limit 10000
-- count distinct - should be # rows of query above
select count(distinct geo_text_extracted) from biosample2
-- json stuff
select count(*), cardinality(keys), keys
from (
select array(select jsonb_object_keys(geo_text_all)) as keys
from biosample3
) as subquery
group by keys
order by count(*) desc
-- keys
select *, jsonb_object_keys(geo_coord_all) from biosample2
-- specific key
select * from biosample2
where geo_coord_all->>'lat_lon' is not null
-- keys filtered by count (cardinality)
select geo_text_all
from (
select geo_text_all, array(select jsonb_object_keys(geo_text_all)) as keys
from biosample4
) as subquery
where cardinality(keys) > 3
limit 100
-- individual key counts
SELECT
geo_coord_key, count(*)
FROM (
SELECT jsonb_object_keys(geo_coord_all) AS geo_coord_key
FROM biosample2
) AS subquery
GROUP BY
geo_coord_key
ORDER BY COUNT(*) DESC-
get key counts
SELECT geo_coord_key, count(*) FROM ( SELECT jsonb_object_keys(geo_text_all) AS geo_coord_key FROM biosample4 ) AS subquery GROUP BY geo_coord_key ORDER BY COUNT(*) DESC
-
go through and validate false positives
select * from biosample4 where geo_text_all::text like '%tumour%' limit 1000
# copy biosample_set.xml.gz
wget -qO- https://ftp.ncbi.nlm.nih.gov/biosample/biosample_set.xml.gz | aws s3 cp - s3://serratus-public/notebook/210212_geo/victorlin/biosample_parse/biosample_set.xml.gz
# unzip to biosample_set.xml
aws s3 cp s3://serratus-public/notebook/210212_geo/victorlin/biosample_parse/biosample_set.xml.gz - | zcat | aws s3 cp - s3://serratus-public/notebook/210212_geo/victorlin/biosample_parse/biosample_set.xml
# preview XML head
zcat < biosample_set.xml.gz | head -n 100000 > head.xml
# find occurrences of latitude_and_longitude in XML
aws s3 cp s3://serratus-public/notebook/210212_geo/victorlin/biosample_parse/biosample_set.xml - | grep --line-buffered latitude_and_longitude-
fix XML mismatch error in last batch
start,end = 42497641724,42509614403
-
geo text cleaning
- unicode
- check for country names or other keywords
-
geocoding
-
parse fuzzy dates (https://dateparser.readthedocs.io/en/latest/ ?)