From 528c9dbfb56a0feb73d7d35ef3e9f0a04d4895f8 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 22 Aug 2023 12:07:13 +0200 Subject: [PATCH 1/9] Add parameters as dict --- get_block.py | 5 +++-- get_course.py | 5 +++-- get_lecture.py | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/get_block.py b/get_block.py index 76e61c0..a854f61 100644 --- a/get_block.py +++ b/get_block.py @@ -16,8 +16,9 @@ def get_lecture_block(id: str): --------- id (str): The id of the lecture block to retrieve """ - url = f"https://teachingkit.climatecompatiblegrowth.com/api/blocks/{id}?locale=en&populate=*" - response = requests.get(url) + url = f"https://teachingkit.climatecompatiblegrowth.com/api/blocks/{id}" + payload = {'locale': 'en', 'populate': '*'} + response = requests.get(url, params=payload) return response.json() diff --git a/get_course.py b/get_course.py index 9e256d3..bc59256 100644 --- a/get_course.py +++ b/get_course.py @@ -20,8 +20,9 @@ def get_course(id: str): --------- id (str): The id of the course to retrieve """ - url = f"https://teachingkit.climatecompatiblegrowth.com/api/courses/{id}?locale=en&populate=*" - response = requests.get(url) + url = f"https://teachingkit.climatecompatiblegrowth.com/api/courses/{id}" + payload = {'locale': 'en', 'populate': '*'} + response = requests.get(url, params=payload) return response.json() diff --git a/get_lecture.py b/get_lecture.py index bebe660..9ea76b1 100644 --- a/get_lecture.py +++ b/get_lecture.py @@ -17,8 +17,9 @@ def get_lecture(id: str): --------- id (str): The id of the lecture block to retrieve """ - url = f"https://teachingkit.climatecompatiblegrowth.com/api/lectures/{id}?locale=en&populate=*" - response = requests.get(url) + url = f"https://teachingkit.climatecompatiblegrowth.com/api/lectures/{id}" + payload = {'locale': 'en', 'populate': '*'} + response = requests.get(url, params=payload) return response.json() From c663855879af8dcb39a4949bc20578b9abb0bc0e Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 22 Aug 2023 12:07:49 +0200 Subject: [PATCH 2/9] Handle embedded images locally --- write_out_course.py | 70 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/write_out_course.py b/write_out_course.py index 6a18499..be24800 100644 --- a/write_out_course.py +++ b/write_out_course.py @@ -17,20 +17,56 @@ """ from os.path import exists -from os import mkdir +from os import mkdir, rmdir from typing import Dict, Union, Any import requests +import re from get_block import get_lecture_block from get_course import get_course from get_lecture import get_lecture +def regular_expression_markdown_image() -> str: + """Returns a regular expression to match markdown image and return the url + + Returns + ------- + str: A regular expression that matches markdown image references + """ + return r'!\[.*\]\(.*\)' + + +def extract_urls(line: str) -> list: + """Extract one or more urls from a markdown image reference + + For example: + + ```markdown + ![alt text](bla_bla.png)![another text](second_image.png) + ``` + will return + ```python + ['bla_bla.png', 'second_image.png'] + ``` + Arguments + --------- + line (str): A line containing one or more markdown image references + + Returns + ------- + list: A list of urls + """ + expression = regular_expression_markdown_image() + urls = re.findall(expression, line) + return [x.split('(')[1].split(')')[0] for x in urls] + + def extract_images(document: str, destination_folder: str): """Extract all images from a markdown document, document to assets subfolder and replace with local references Iterates through each line in a markdown document. - Extracts all ![img](url) references, downloads the image and saves it to the assets. + Extracts all ![img](url) references using regular expression, downloads the image and saves it to the assets. Then replaces the original reference with a local reference. Arguments @@ -38,20 +74,22 @@ def extract_images(document: str, destination_folder: str): document (str): The markdown document to extract images from """ for line in document.split('\n'): - if line.startswith('!['): - url = line.split('(')[1].split(')')[0] - filename = url.split('/')[-1] - print(f"Downloading {url} to {destination_folder}/{filename}") - try: - image = requests.get(url, allow_redirects=True) - if image.status_code == 200: - document = document.replace(url, f"assets/{filename}") - with open(f"{destination_folder}/{filename}", 'wb') as f: - f.write(image.content) - else: - print(f"Error downloading {url}") - except Exception as e: - print(f"Error downloading {url}: {e}") + expression = regular_expression_markdown_image() + if re.match(expression, line): + urls = extract_urls(line) + for url in urls: + filename = url.split('/')[-1] + print(f"Downloading {url} to {destination_folder}/{filename}") + try: + image = requests.get(url, allow_redirects=True) + if image.status_code == 200: + document = document.replace(url, f"assets/{filename}") + with open(f"{destination_folder}/{filename}", 'wb') as f: + f.write(image.content) + else: + print(f"Error downloading {url}") + except Exception as e: + print(f"Error downloading {url}: {e}") return document From 1cc67aa42b98053b6a509c6d7c552ac02ff14fd7 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 22 Aug 2023 15:17:49 +0200 Subject: [PATCH 3/9] Skip unpublished blocks --- get_block.py | 7 ++++++- get_lecture.py | 11 +++++++---- write_out_course.py | 11 +++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/get_block.py b/get_block.py index a854f61..8baf537 100644 --- a/get_block.py +++ b/get_block.py @@ -33,11 +33,16 @@ def print_keys(dict: Union[Dict, Any]): if __name__ == "__main__": - block = get_lecture_block(4) + block = get_lecture_block(5) attributes = block['data']['attributes'] print(f"Attributes: {attributes.keys()}") + print(f"Version number: {attributes['versionNumber']}") + print(f"Version: {[x['id'] for x in attributes['versions']['data']]}") + + print(f"Published at: {attributes['publishedAt']}") + lectures = [x['attributes'] for x in attributes['Lectures']['data']] for lecture in lectures: print(f"This block is part of lecture '{lecture['Title']}'") diff --git a/get_lecture.py b/get_lecture.py index 9ea76b1..8427793 100644 --- a/get_lecture.py +++ b/get_lecture.py @@ -40,8 +40,11 @@ def print_keys(dict: Union[Dict, Any]): print(f"Attributes: {attributes.keys()}") + print(f"Version number: {attributes['versionNumber']}") + print(f"Version: {attributes['versions']}") + blocks = attributes['Blocks']['data'] - print(blocks) + # print(blocks) for block in blocks: print(f"This lecture contains block {block['id']}: '{block['attributes']['Title']}'") @@ -60,7 +63,7 @@ def print_keys(dict: Union[Dict, Any]): lecture_id = lecture['data']['id'] document = attributes['Abstract'] - with open(f"lecture_{lecture_id}.md", 'wt') as markdown_file: - markdown_file.write(f"# {title}\n\n") - markdown_file.write(document) + # with open(f"lecture_{lecture_id}.md", 'wt') as markdown_file: + # markdown_file.write(f"# {title}\n\n") + # markdown_file.write(document) diff --git a/write_out_course.py b/write_out_course.py index be24800..a7d781f 100644 --- a/write_out_course.py +++ b/write_out_course.py @@ -114,10 +114,13 @@ def main(course_id: int, destination_folder: str) -> bool: blocks = [get_lecture_block(x['id']) for x in lecture['data']['attributes']['Blocks']['data']] for block in blocks: - block_path = f"{lecture_path}/{block['data']['id']}" - with open(f"{block_path}.md", 'w') as f: - document = extract_images(block['data']['attributes']['Document'], assets_path) - f.write(document) + # Skip blocks which are not published + if block['data']['attributes']['publishedAt']: + block_path = f"{lecture_path}/{block['data']['id']}" + with open(f"{block_path}.md", 'w') as markdown_file: + block_document = block['data']['attributes']['Document'] + document = extract_images(block_document, assets_path) + markdown_file.write(document) return success From d1d0d09f13ded47cb75a45253221accff5be0f27 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Thu, 21 Sep 2023 18:14:43 +0100 Subject: [PATCH 4/9] Write out the metadata as YAML --- .gitignore | 1 + get_lecture.py | 1 - write_out_course.py | 17 ++++++++++++++++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index ee2a38d..aef15ed 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Outputs *.md +*.png # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/get_lecture.py b/get_lecture.py index 8427793..8309def 100644 --- a/get_lecture.py +++ b/get_lecture.py @@ -4,7 +4,6 @@ >>> python get_lecture.py - """ import requests from typing import Dict, Union, Any diff --git a/write_out_course.py b/write_out_course.py index a7d781f..d9f2335 100644 --- a/write_out_course.py +++ b/write_out_course.py @@ -22,6 +22,8 @@ import requests import re +from yaml import dump + from get_block import get_lecture_block from get_course import get_course from get_lecture import get_lecture @@ -120,11 +122,24 @@ def main(course_id: int, destination_folder: str) -> bool: with open(f"{block_path}.md", 'w') as markdown_file: block_document = block['data']['attributes']['Document'] document = extract_images(block_document, assets_path) + + # Extract metadata for the top of the block + title = block['data']['attributes']['Title'] + author_list = [x['attributes'] for x in block['data']['attributes']['Authors']['data']] + authors = [] + for author in author_list: + authors.append(f"{author['FirstName']}, {author['LastName']}") + # Write the metadata as a YAML block to the top of the markdown file + yaml_dict = {'title': title, 'authors': authors} + yaml_meta = dump(yaml_dict, default_flow_style=False) + markdown_file.write('---\n') + markdown_file.write(yaml_meta) + markdown_file.write('---\n\n') markdown_file.write(document) return success if __name__ == "__main__": - success = main(2, "course_2") + success = main(2, "infra") print(success) \ No newline at end of file From b53d405fde00819278095ea17e369b32f3c3980f Mon Sep 17 00:00:00 2001 From: Reema Mohanty Date: Wed, 27 Sep 2023 14:50:37 +0100 Subject: [PATCH 5/9] New file added --- get_block.py | 1 - 1 file changed, 1 deletion(-) diff --git a/get_block.py b/get_block.py index 8baf537..cc81e6e 100644 --- a/get_block.py +++ b/get_block.py @@ -11,7 +11,6 @@ def get_lecture_block(id: str): """Retrieves the contents of a lecture block from the teaching kit website - Arguments --------- id (str): The id of the lecture block to retrieve From 843234d5e173e8eb79869aea4ebb6fcec016b81f Mon Sep 17 00:00:00 2001 From: Reema Mohanty Date: Wed, 4 Oct 2023 13:41:27 +0100 Subject: [PATCH 6/9] Changes added --- get_block.py | 1 + get_course.py | 15 +------------- get_keywrd.py | 50 +++++++++++++++++++++++++++++++++++++++++++++ get_lecture.py | 5 ++++- write_out_course.py | 5 +++-- 5 files changed, 59 insertions(+), 17 deletions(-) create mode 100644 get_keywrd.py diff --git a/get_block.py b/get_block.py index cc81e6e..f4f13d0 100644 --- a/get_block.py +++ b/get_block.py @@ -5,6 +5,7 @@ >>> python get_block.py """ +from matplotlib.widgets import Slider, SliderBase import requests from typing import Dict, Union, Any from json import loads, dumps diff --git a/get_course.py b/get_course.py index bc59256..4974505 100644 --- a/get_course.py +++ b/get_course.py @@ -44,6 +44,7 @@ def print_keys(dict: Union[Dict, Any]): title = attributes['Title'] print(f"Course title: {title}") + outcomes = [x['LearningOutcome'] for x in attributes['LearningOutcomes']] print(f"Outcomes: {outcomes}") @@ -53,17 +54,3 @@ def print_keys(dict: Union[Dict, Any]): print(f"This course contains lecture: '{lecture['id']}: {lecture['attributes']['Title']}'") - # authors = [x['attributes'] for x in attributes['LectureCreators']['data']] - - # print("This lecture lecture was written by:") - # for author in authors: - # print(f"{author['FirstName']} {author['LastName']} {author['Email']} {author['ORCID']}") - - - - # lecture_id = lecture['data']['id'] - # document = attributes['Abstract'] - # with open(f"lecture_{lecture_id}.md", 'wt') as markdown_file: - # markdown_file.write(f"# {title}\n\n") - # markdown_file.write(document) - diff --git a/get_keywrd.py b/get_keywrd.py new file mode 100644 index 0000000..885890d --- /dev/null +++ b/get_keywrd.py @@ -0,0 +1,50 @@ +import nltk +from nltk.tokenize import word_tokenize +from nltk.corpus import stopwords +from nltk.probability import FreqDist +import markdown + +from get_lecture import get_lecture + +# Download NLTK data if not already downloaded +nltk.download('punkt') +nltk.download('stopwords') + +# Function to extract keywords from a text +def extract_keywords(text): + # Tokenize the text + words = word_tokenize(text.lower()) + + # Remove stopwords and punctuation + stopwords_set = set(stopwords.words('english')) + words = [word for word in words if word.isalnum() and word not in stopwords_set] + + # Calculate word frequency + word_freq = FreqDist(words) + + # Select the top 10 most frequent words as keywords + keywords = [word for word, _ in word_freq.most_common(10)] + + return keywords + + #Extract keywords from the lecture note +keywords = extract_keywords('get_lecture') + +# Generate Markdown content with keywords +#markdown_content = "\n".join([f"- {keyword}" for keyword in keywords]) + +# Save the Markdown content to a file +#with open("keywords.md", "w") as file: + #file.write(markdown_content) + +#print("Keywords extracted and saved to 'keywords.md'") + +# Optional: Print the extracted keywords +#print("Extracted Keywords:") +for keyword in keywords: + print(keyword) + + + + + diff --git a/get_lecture.py b/get_lecture.py index 8309def..3db36d3 100644 --- a/get_lecture.py +++ b/get_lecture.py @@ -6,6 +6,7 @@ """ import requests +#import keywords from typing import Dict, Union, Any @@ -45,7 +46,7 @@ def print_keys(dict: Union[Dict, Any]): blocks = attributes['Blocks']['data'] # print(blocks) for block in blocks: - print(f"This lecture contains block {block['id']}: '{block['attributes']['Title']}'") + print(f"This lecture contains block {block['id']}: '{block['attributes']['Title']}'") #'{block['attributes']['keywords']}") authors = [x['attributes'] for x in attributes['LectureCreators']['data']] @@ -59,6 +60,8 @@ def print_keys(dict: Union[Dict, Any]): title = attributes['Title'] print(title) + keywords = attributes['keywords'] + print(keywords) lecture_id = lecture['data']['id'] document = attributes['Abstract'] diff --git a/write_out_course.py b/write_out_course.py index d9f2335..daa9513 100644 --- a/write_out_course.py +++ b/write_out_course.py @@ -27,6 +27,7 @@ from get_block import get_lecture_block from get_course import get_course from get_lecture import get_lecture +from get_keywrd import keywords def regular_expression_markdown_image() -> str: @@ -125,12 +126,13 @@ def main(course_id: int, destination_folder: str) -> bool: # Extract metadata for the top of the block title = block['data']['attributes']['Title'] + #keywords = block['data']['attributes']['keywords'] author_list = [x['attributes'] for x in block['data']['attributes']['Authors']['data']] authors = [] for author in author_list: authors.append(f"{author['FirstName']}, {author['LastName']}") # Write the metadata as a YAML block to the top of the markdown file - yaml_dict = {'title': title, 'authors': authors} + yaml_dict = {'title': title, 'authors': authors,'keywords': keywords} #'keywords': keywords yaml_meta = dump(yaml_dict, default_flow_style=False) markdown_file.write('---\n') markdown_file.write(yaml_meta) @@ -139,7 +141,6 @@ def main(course_id: int, destination_folder: str) -> bool: return success - if __name__ == "__main__": success = main(2, "infra") print(success) \ No newline at end of file From 34e8591c32e8158ef2057acdfd29667b5c8d352f Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 17 Oct 2023 11:49:52 +0200 Subject: [PATCH 7/9] Added keywords with fixed 'en' locale to markdown --- write_out_course.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/write_out_course.py b/write_out_course.py index daa9513..2c2565a 100644 --- a/write_out_course.py +++ b/write_out_course.py @@ -27,7 +27,6 @@ from get_block import get_lecture_block from get_course import get_course from get_lecture import get_lecture -from get_keywrd import keywords def regular_expression_markdown_image() -> str: @@ -126,13 +125,18 @@ def main(course_id: int, destination_folder: str) -> bool: # Extract metadata for the top of the block title = block['data']['attributes']['Title'] - #keywords = block['data']['attributes']['keywords'] + + keywords = [] + for keyword in [x['attributes'] for x in block['data']['attributes']['Keywords']['data']]: + if 'en' in keyword['locale']: + keywords.append(keyword['Keyword']) + author_list = [x['attributes'] for x in block['data']['attributes']['Authors']['data']] authors = [] for author in author_list: authors.append(f"{author['FirstName']}, {author['LastName']}") # Write the metadata as a YAML block to the top of the markdown file - yaml_dict = {'title': title, 'authors': authors,'keywords': keywords} #'keywords': keywords + yaml_dict = {'title': title, 'authors': authors,'keywords': keywords} yaml_meta = dump(yaml_dict, default_flow_style=False) markdown_file.write('---\n') markdown_file.write(yaml_meta) From c17c1261ad38b53c5e28c2e9d3ae1bf7d9576d05 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 17 Oct 2023 12:10:02 +0200 Subject: [PATCH 8/9] Fixed regular expression when multiple images in a line --- README.md | 4 ++-- test_write_out_course.py | 8 ++++++++ write_out_course.py | 11 +++++++++-- 3 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 test_write_out_course.py diff --git a/README.md b/README.md index 36160c9..3a163c2 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ and the format to convert the material to. ## Remaining development -- Only return blocks and lectures which are published, and the latest version (normally only the latest version is published, but not necessarily). +- [ ] Only return blocks and lectures which are published, and the latest version (normally only the latest version is published, but not necessarily). Currently the relations store all blocks, including earlier versions of the same block, that are included in a lecture. The same for lectures and courses. -- We need to re-do the embedding of images in the markdown documents. Images are stored in the media library, but are then uploaded to an amazon server, so these can be downloaded from the urls embedded in the document and then redirected to a local location. +- [ ] We need to re-do the embedding of images in the markdown documents. Images are stored in the media library, but are then uploaded to an amazon server, so these can be downloaded from the urls embedded in the document and then redirected to a local location. diff --git a/test_write_out_course.py b/test_write_out_course.py new file mode 100644 index 0000000..1af5fb6 --- /dev/null +++ b/test_write_out_course.py @@ -0,0 +1,8 @@ +from write_out_course import extract_urls + +def test_extract_urls(): + + test_string = "![Fig_1.4.2b.png](https://test.com/1.png)![Fig_1.4.2c.png](https://test.com/2.png)" + urls = extract_urls(test_string) + print(urls) + assert len(urls) == 2 \ No newline at end of file diff --git a/write_out_course.py b/write_out_course.py index 2c2565a..ffe6709 100644 --- a/write_out_course.py +++ b/write_out_course.py @@ -36,7 +36,7 @@ def regular_expression_markdown_image() -> str: ------- str: A regular expression that matches markdown image references """ - return r'!\[.*\]\(.*\)' + return r'!\[.*?\]\(.*?\)' def extract_urls(line: str) -> list: @@ -63,6 +63,9 @@ def extract_urls(line: str) -> list: urls = re.findall(expression, line) return [x.split('(')[1].split(')')[0] for x in urls] +def check_for_multiple_images(line: str) -> bool: + if len(line.split('!')) > 1: + return True def extract_images(document: str, destination_folder: str): """Extract all images from a markdown document, document to assets subfolder and replace with local references @@ -73,7 +76,11 @@ def extract_images(document: str, destination_folder: str): Arguments ---------- - document (str): The markdown document to extract images from + document: str + The markdown document to extract images from + destination_folder : str + The folder in which to place all the image files + """ for line in document.split('\n'): expression = regular_expression_markdown_image() From f4306a64474188734e6b542f9d31fe43207e76af Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 17 Oct 2023 12:44:02 +0200 Subject: [PATCH 9/9] Small refactor for readability --- write_out_course.py | 62 ++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/write_out_course.py b/write_out_course.py index ffe6709..eba747c 100644 --- a/write_out_course.py +++ b/write_out_course.py @@ -102,12 +102,45 @@ def extract_images(document: str, destination_folder: str): return document +def extract_metadata(block): + """Extract metadata from a block. + + Arguments + --------- + block: dict + The block data from which metadata is to be extracted. + + Returns + ------- + dict + A dictionary containing block metadata with keys 'title', 'authors', and 'keywords'. + """ + # Extract metadata for the top of the block + title = block['data']['attributes']['Title'] + + keywords = [] + for keyword in [x['attributes'] for x in block['data']['attributes']['Keywords']['data']]: + if 'en' in keyword['locale']: + keywords.append(keyword['Keyword']) + + author_list = [x['attributes'] for x in block['data']['attributes']['Authors']['data']] + authors = [] + for author in author_list: + authors.append(f"{author['FirstName']}, {author['LastName']}") + # Write the metadata as a YAML block to the top of the markdown file + return {'title': title, 'authors': authors, 'keywords': keywords} + + +def create_directory_if_not_exists(directory): + if not exists(directory): + mkdir(directory) + + def main(course_id: int, destination_folder: str) -> bool: success = True - if not exists(destination_folder): - mkdir(destination_folder) + create_directory_if_not_exists(destination_folder) course = get_course(course_id) lecture_id: list = [x['id'] for x in course['data']['attributes']['Lectures']['data']] @@ -116,12 +149,11 @@ def main(course_id: int, destination_folder: str) -> bool: lecture_path = f"{destination_folder}/lecture_{lecture['data']['id']}" assets_path = f"{lecture_path}/assets" - if not exists(lecture_path): - mkdir(lecture_path) - if not exists(assets_path): - mkdir(assets_path) + create_directory_if_not_exists(lecture_path) + create_directory_if_not_exists(assets_path) blocks = [get_lecture_block(x['id']) for x in lecture['data']['attributes']['Blocks']['data']] + for block in blocks: # Skip blocks which are not published if block['data']['attributes']['publishedAt']: @@ -130,20 +162,7 @@ def main(course_id: int, destination_folder: str) -> bool: block_document = block['data']['attributes']['Document'] document = extract_images(block_document, assets_path) - # Extract metadata for the top of the block - title = block['data']['attributes']['Title'] - - keywords = [] - for keyword in [x['attributes'] for x in block['data']['attributes']['Keywords']['data']]: - if 'en' in keyword['locale']: - keywords.append(keyword['Keyword']) - - author_list = [x['attributes'] for x in block['data']['attributes']['Authors']['data']] - authors = [] - for author in author_list: - authors.append(f"{author['FirstName']}, {author['LastName']}") - # Write the metadata as a YAML block to the top of the markdown file - yaml_dict = {'title': title, 'authors': authors,'keywords': keywords} + yaml_dict = extract_metadata(block) yaml_meta = dump(yaml_dict, default_flow_style=False) markdown_file.write('---\n') markdown_file.write(yaml_meta) @@ -152,6 +171,7 @@ def main(course_id: int, destination_folder: str) -> bool: return success + if __name__ == "__main__": success = main(2, "infra") - print(success) \ No newline at end of file + print(success)