forked from blotterfyi/velocity
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.py
More file actions
69 lines (60 loc) · 2.37 KB
/
parser.py
File metadata and controls
69 lines (60 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import re
import pytz
import json
import shutil
import shelve
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from collections import defaultdict
from logger import get_logger
# Module-level logger, created by the project's get_logger helper with this module's name.
logger = get_logger(__name__)
class Parser:
    """Split the HTML of an SEC filing into its "Item ..." sections."""

    # Inline-style fragments that mark an element as bold; both spacings
    # are checked.
    _BOLD_MARKERS = ('font-weight:700', 'font-weight: 700')
    # A text fragment is kept only when it has MORE than this many
    # space-separated words (filters out short fragments).
    _MAX_SHORT_FRAGMENT_WORDS = 15
    # A section is dropped when its kept text has FEWER words than this.
    _MIN_SECTION_WORDS = 20
    # Leading "Item <id>" plus trailing separators; used only for headings
    # that contain no period.
    _ITEM_PREFIX = re.compile(r'^item\s+\w+[\s:\-\u2013\u2014]*', re.IGNORECASE)
    # Characters removed from section keys: comma and right single quote.
    _KEY_STRIP_TABLE = str.maketrans('', '', ',’')

    def __init__(self):
        pass

    def parse_sec_filing(self, html_content):
        """
        Parse SEC filing HTML content and extract relevant sections.

        A section starts at a bold ``span``/``div`` (inline style containing
        ``font-weight:700``) whose text begins with "item " (case-insensitive).
        Text of the following non-bold ``span``/``div`` elements is collected
        into that section until the next such heading.

        Args:
            html_content (str): The HTML content of the SEC filing.

        Returns:
            dict: Maps a normalized section key (up to the first four words
            of the heading title, lowercased and joined with underscores;
            the key "business" is renamed to "business_info") to the
            section text, with kept fragments joined by newlines. Sections
            with fewer than 20 words of kept text are omitted. Empty or
            missing input yields an empty dict.
        """
        if not html_content:
            # Nothing to parse; avoid handing empty/None markup to the parser.
            return {}

        soup = BeautifulSoup(html_content, 'html.parser')
        raw_sections = self._collect_sections(soup)

        cleaned_sections = {}
        for heading, fragments in raw_sections.items():
            if not fragments:
                continue
            # Word counts deliberately use split(" ") so the thresholds
            # behave exactly as before.
            kept = [
                fragment for fragment in fragments
                if len(fragment.split(" ")) > self._MAX_SHORT_FRAGMENT_WORDS
            ]
            content = '\n'.join(kept)
            if len(content.split(" ")) < self._MIN_SECTION_WORDS:
                continue
            # NOTE: two headings that normalize to the same key overwrite
            # each other; the later one wins.
            cleaned_sections[self._section_key(heading)] = content
        return cleaned_sections

    @staticmethod
    def _collect_sections(soup):
        """Group non-bold span/div text under the latest bold "Item ..." heading."""
        sections = defaultdict(list)
        current_heading = None
        # NOTE(review): find_all returns nested span/div elements as well as
        # their parents, so the same text may be collected more than once.
        for element in soup.find_all(['span', 'div']):
            style = element.get('style', '')
            if any(marker in style for marker in Parser._BOLD_MARKERS):
                heading = element.get_text(strip=True)
                if heading.lower().startswith("item "):
                    current_heading = heading
                    # A repeated heading restarts its section from scratch.
                    sections[current_heading] = []
                # Bold elements that are not "Item ..." headings are ignored.
            elif current_heading:
                text = element.get_text(strip=True)
                if text:
                    sections[current_heading].append(text)
        return sections

    @staticmethod
    def _section_key(heading):
        """Turn a heading such as "Item 1A. Risk Factors" into "risk_factors"."""
        heading = str(heading)
        label, dot, remainder = heading.partition(".")
        if dot:
            # Title is the text between the first and second period.
            title = remainder.split(".")[0]
        else:
            # No period at all (e.g. "Item 1A - Risk Factors"): strip the
            # "Item <id>" prefix instead of failing with an IndexError.
            title = Parser._ITEM_PREFIX.sub("", heading)

        words = title.split()
        if not words:
            # Heading carried no title (e.g. "Item 1."): fall back to the
            # item label so such sections do not all collide on an empty key.
            words = label.split()

        # first four words are section_key
        section_key = '_'.join(words[:4])
        section_key = section_key.translate(Parser._KEY_STRIP_TABLE).lower()

        # make these more readable for an LLM
        if section_key == "business":
            section_key = "business_info"
        return section_key