diff --git a/.gitignore b/.gitignore index ee2a38d..aef15ed 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Outputs *.md +*.png # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index 36160c9..3a163c2 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ and the format to convert the material to. ## Remaining development -- Only return blocks and lectures which are published, and the latest version (normally only the latest version is published, but not necessarily). +- [ ] Only return blocks and lectures which are published, and the latest version (normally only the latest version is published, but not necessarily). Currently the relations store all blocks, including earlier versions of the same block, that are included in a lecture. The same for lectures and courses. -- We need to re-do the embedding of images in the markdown documents. Images are stored in the media library, but are then uploaded to an amazon server, so these can be downloaded from the urls embedded in the document and then redirected to a local location. +- [ ] We need to re-do the embedding of images in the markdown documents. Images are stored in the media library, but are then uploaded to an amazon server, so these can be downloaded from the urls embedded in the document and then redirected to a local location. diff --git a/get_block.py b/get_block.py index 76e61c0..f4f13d0 100644 --- a/get_block.py +++ b/get_block.py @@ -5,19 +5,20 @@ >>> python get_block.py """ +from matplotlib.widgets import Slider, SliderBase import requests from typing import Dict, Union, Any from json import loads, dumps def get_lecture_block(id: str): """Retrieves the contents of a lecture block from the teaching kit website - Arguments --------- id (str): The id of the lecture block to retrieve """ - url = f"https://teachingkit.climatecompatiblegrowth.com/api/blocks/{id}?locale=en&populate=*" - response = requests.get(url) + url = f"https://teachingkit.climatecompatiblegrowth.com/api/blocks/{id}" + payload = {'locale': 'en', 'populate': '*'} + response = requests.get(url, params=payload) return response.json() @@ -32,11 +33,16 @@ def print_keys(dict: Union[Dict, Any]): if __name__ == "__main__": - block = get_lecture_block(4) + block = get_lecture_block(5) attributes = block['data']['attributes'] print(f"Attributes: {attributes.keys()}") + print(f"Version number: {attributes['versionNumber']}") + print(f"Version: {[x['id'] for x in attributes['versions']['data']]}") + + print(f"Published at: {attributes['publishedAt']}") + lectures = [x['attributes'] for x in attributes['Lectures']['data']] for lecture in lectures: print(f"This block is part of lecture '{lecture['Title']}'") diff --git a/get_course.py b/get_course.py index 9e256d3..4974505 100644 --- a/get_course.py +++ b/get_course.py @@ -20,8 +20,9 @@ def get_course(id: str): --------- id (str): The id of the course to retrieve """ - url = f"https://teachingkit.climatecompatiblegrowth.com/api/courses/{id}?locale=en&populate=*" - response = requests.get(url) + url = f"https://teachingkit.climatecompatiblegrowth.com/api/courses/{id}" + payload = {'locale': 'en', 'populate': '*'} + response = requests.get(url, params=payload) return response.json() @@ -43,6 +44,7 @@ def print_keys(dict: Union[Dict, Any]): title = attributes['Title'] print(f"Course title: {title}") + outcomes = [x['LearningOutcome'] for x in attributes['LearningOutcomes']] print(f"Outcomes: {outcomes}") @@ -52,17 +54,3 @@ def print_keys(dict: Union[Dict, Any]): print(f"This course contains lecture: '{lecture['id']}: {lecture['attributes']['Title']}'") - # authors = [x['attributes'] for x in attributes['LectureCreators']['data']] - - # print("This lecture lecture was written by:") - # for author in authors: - # print(f"{author['FirstName']} {author['LastName']} {author['Email']} {author['ORCID']}") - - - - # lecture_id = lecture['data']['id'] - # document = attributes['Abstract'] - # with open(f"lecture_{lecture_id}.md", 'wt') as markdown_file: - # markdown_file.write(f"# {title}\n\n") - # markdown_file.write(document) - diff --git a/get_keywrd.py b/get_keywrd.py new file mode 100644 index 0000000..885890d --- /dev/null +++ b/get_keywrd.py @@ -0,0 +1,50 @@ +import nltk +from nltk.tokenize import word_tokenize +from nltk.corpus import stopwords +from nltk.probability import FreqDist +import markdown + +from get_lecture import get_lecture + +# Download NLTK data if not already downloaded +nltk.download('punkt') +nltk.download('stopwords') + +# Function to extract keywords from a text +def extract_keywords(text): + # Tokenize the text + words = word_tokenize(text.lower()) + + # Remove stopwords and punctuation + stopwords_set = set(stopwords.words('english')) + words = [word for word in words if word.isalnum() and word not in stopwords_set] + + # Calculate word frequency + word_freq = FreqDist(words) + + # Select the top 10 most frequent words as keywords + keywords = [word for word, _ in word_freq.most_common(10)] + + return keywords + + #Extract keywords from the lecture note +keywords = extract_keywords('get_lecture') + +# Generate Markdown content with keywords +#markdown_content = "\n".join([f"- {keyword}" for keyword in keywords]) + +# Save the Markdown content to a file +#with open("keywords.md", "w") as file: + #file.write(markdown_content) + +#print("Keywords extracted and saved to 'keywords.md'") + +# Optional: Print the extracted keywords +#print("Extracted Keywords:") +for keyword in keywords: + print(keyword) + + + + + diff --git a/get_lecture.py b/get_lecture.py index bebe660..3db36d3 100644 --- a/get_lecture.py +++ b/get_lecture.py @@ -4,9 +4,9 @@ >>> python get_lecture.py - """ import requests +#import keywords from typing import Dict, Union, Any @@ -17,8 +17,9 @@ def get_lecture(id: str): --------- id (str): The id of the lecture block to retrieve """ - url = f"https://teachingkit.climatecompatiblegrowth.com/api/lectures/{id}?locale=en&populate=*" - response = requests.get(url) + url = f"https://teachingkit.climatecompatiblegrowth.com/api/lectures/{id}" + payload = {'locale': 'en', 'populate': '*'} + response = requests.get(url, params=payload) return response.json() @@ -39,10 +40,13 @@ def print_keys(dict: Union[Dict, Any]): print(f"Attributes: {attributes.keys()}") + print(f"Version number: {attributes['versionNumber']}") + print(f"Version: {attributes['versions']}") + blocks = attributes['Blocks']['data'] - print(blocks) + # print(blocks) for block in blocks: - print(f"This lecture contains block {block['id']}: '{block['attributes']['Title']}'") + print(f"This lecture contains block {block['id']}: '{block['attributes']['Title']}'") #'{block['attributes']['keywords']}") authors = [x['attributes'] for x in attributes['LectureCreators']['data']] @@ -56,10 +60,12 @@ def print_keys(dict: Union[Dict, Any]): title = attributes['Title'] print(title) + keywords = attributes['keywords'] + print(keywords) lecture_id = lecture['data']['id'] document = attributes['Abstract'] - with open(f"lecture_{lecture_id}.md", 'wt') as markdown_file: - markdown_file.write(f"# {title}\n\n") - markdown_file.write(document) + # with open(f"lecture_{lecture_id}.md", 'wt') as markdown_file: + # markdown_file.write(f"# {title}\n\n") + # markdown_file.write(document) diff --git a/test_write_out_course.py b/test_write_out_course.py new file mode 100644 index 0000000..1af5fb6 --- /dev/null +++ b/test_write_out_course.py @@ -0,0 +1,8 @@ +from write_out_course import extract_urls + +def test_extract_urls(): + + test_string = "![Fig_1.4.2b.png](https://test.com/1.png)![Fig_1.4.2c.png](https://test.com/2.png)" + urls = extract_urls(test_string) + print(urls) + assert len(urls) == 2 \ No newline at end of file diff --git a/write_out_course.py b/write_out_course.py index 6a18499..eba747c 100644 --- a/write_out_course.py +++ b/write_out_course.py @@ -17,50 +17,130 @@ """ from os.path import exists -from os import mkdir +from os import mkdir, rmdir from typing import Dict, Union, Any import requests +import re + +from yaml import dump from get_block import get_lecture_block from get_course import get_course from get_lecture import get_lecture +def regular_expression_markdown_image() -> str: + """Returns a regular expression to match markdown image and return the url + + Returns + ------- + str: A regular expression that matches markdown image references + """ + return r'!\[.*?\]\(.*?\)' + + +def extract_urls(line: str) -> list: + """Extract one or more urls from a markdown image reference + + For example: + + ```markdown + ![alt text](bla_bla.png)![another text](second_image.png) + ``` + will return + ```python + ['bla_bla.png', 'second_image.png'] + ``` + Arguments + --------- + line (str): A line containing one or more markdown image references + + Returns + ------- + list: A list of urls + """ + expression = regular_expression_markdown_image() + urls = re.findall(expression, line) + return [x.split('(')[1].split(')')[0] for x in urls] + +def check_for_multiple_images(line: str) -> bool: + if len(line.split('!')) > 1: + return True + def extract_images(document: str, destination_folder: str): """Extract all images from a markdown document, document to assets subfolder and replace with local references Iterates through each line in a markdown document. - Extracts all ![img](url) references, downloads the image and saves it to the assets. + Extracts all ![img](url) references using regular expression, downloads the image and saves it to the assets. Then replaces the original reference with a local reference. Arguments ---------- - document (str): The markdown document to extract images from + document: str + The markdown document to extract images from + destination_folder : str + The folder in which to place all the image files + """ for line in document.split('\n'): - if line.startswith('!['): - url = line.split('(')[1].split(')')[0] - filename = url.split('/')[-1] - print(f"Downloading {url} to {destination_folder}/{filename}") - try: - image = requests.get(url, allow_redirects=True) - if image.status_code == 200: - document = document.replace(url, f"assets/{filename}") - with open(f"{destination_folder}/{filename}", 'wb') as f: - f.write(image.content) - else: - print(f"Error downloading {url}") - except Exception as e: - print(f"Error downloading {url}: {e}") + expression = regular_expression_markdown_image() + if re.match(expression, line): + urls = extract_urls(line) + for url in urls: + filename = url.split('/')[-1] + print(f"Downloading {url} to {destination_folder}/{filename}") + try: + image = requests.get(url, allow_redirects=True) + if image.status_code == 200: + document = document.replace(url, f"assets/{filename}") + with open(f"{destination_folder}/{filename}", 'wb') as f: + f.write(image.content) + else: + print(f"Error downloading {url}") + except Exception as e: + print(f"Error downloading {url}: {e}") return document +def extract_metadata(block): + """Extract metadata from a block. + + Arguments + --------- + block: dict + The block data from which metadata is to be extracted. + + Returns + ------- + dict + A dictionary containing block metadata with keys 'title', 'authors', and 'keywords'. + """ + # Extract metadata for the top of the block + title = block['data']['attributes']['Title'] + + keywords = [] + for keyword in [x['attributes'] for x in block['data']['attributes']['Keywords']['data']]: + if 'en' in keyword['locale']: + keywords.append(keyword['Keyword']) + + author_list = [x['attributes'] for x in block['data']['attributes']['Authors']['data']] + authors = [] + for author in author_list: + authors.append(f"{author['FirstName']}, {author['LastName']}") + # Write the metadata as a YAML block to the top of the markdown file + return {'title': title, 'authors': authors, 'keywords': keywords} + + +def create_directory_if_not_exists(directory): + if not exists(directory): + mkdir(directory) + + def main(course_id: int, destination_folder: str) -> bool: success = True - if not exists(destination_folder): - mkdir(destination_folder) + create_directory_if_not_exists(destination_folder) course = get_course(course_id) lecture_id: list = [x['id'] for x in course['data']['attributes']['Lectures']['data']] @@ -69,21 +149,29 @@ def main(course_id: int, destination_folder: str) -> bool: lecture_path = f"{destination_folder}/lecture_{lecture['data']['id']}" assets_path = f"{lecture_path}/assets" - if not exists(lecture_path): - mkdir(lecture_path) - if not exists(assets_path): - mkdir(assets_path) + create_directory_if_not_exists(lecture_path) + create_directory_if_not_exists(assets_path) blocks = [get_lecture_block(x['id']) for x in lecture['data']['attributes']['Blocks']['data']] + for block in blocks: - block_path = f"{lecture_path}/{block['data']['id']}" - with open(f"{block_path}.md", 'w') as f: - document = extract_images(block['data']['attributes']['Document'], assets_path) - f.write(document) + # Skip blocks which are not published + if block['data']['attributes']['publishedAt']: + block_path = f"{lecture_path}/{block['data']['id']}" + with open(f"{block_path}.md", 'w') as markdown_file: + block_document = block['data']['attributes']['Document'] + document = extract_images(block_document, assets_path) + + yaml_dict = extract_metadata(block) + yaml_meta = dump(yaml_dict, default_flow_style=False) + markdown_file.write('---\n') + markdown_file.write(yaml_meta) + markdown_file.write('---\n\n') + markdown_file.write(document) return success if __name__ == "__main__": - success = main(2, "course_2") - print(success) \ No newline at end of file + success = main(2, "infra") + print(success)