python_learning/Leetcode/leetcode_parser.py at master · Dchamel/python_learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# This program creates a ready-made template
# for solving problems from leetcode.com in Python 3.
# It consists of:
# a LeetCode scraper
# a LeetCode parser


import json
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import codecs
import html
from re import search

# INPUT LEETCODE TASK URL BELOW
URL = 'https://leetcode.com/problems/reverse-string/'
# Input the NEXT number of the solution file in your folder
file_num = 34

# Run Scraper
# You need to download and unpack a ChromeDriver build compatible
# with your Google Chrome web browser
# https://chromedriver.chromium.org/downloads
# and change executable_path to the path where your driver lies
service = Service(executable_path=r'C:\Chrome\chromedriver.exe')

options = webdriver.ChromeOptions()
options.add_argument("headless")
driver = webdriver.Chrome(service=service, options=options)
try:
    driver.get(URL)
    html_raw = driver.page_source
finally:
    # Always shut the browser down, even when loading the page fails,
    # so no headless Chrome process is left behind.
    driver.quit()

# Save to HTML
completeName = os.path.join('', 'tmp.html')
# The context manager closes (and therefore flushes) the file, so the page
# is completely on disk before tmp.html is opened again for parsing.
# newline="" writes the text unchanged, without newline translation.
with open(completeName, "w", encoding="utf-8", newline="") as file_object:
    file_object.write(html_raw)


# TODO: delete this commented-out draft of the __NEXT_DATA__ extraction
# soup = BeautifulSoup(html_raw, "html.parser")
# script = soup.find('script', {'id': '__NEXT_DATA__'})
# data = json.loads(script.get_text(strip=True))
#
# # Write it to the file
# with open("tmp.json", "w") as f:
#     json.dump(data, f)
#
# with open('tmp.json', 'r') as f:
#     x = json.load(f)


# Parse data from RAW html
def parse_html(html: str) -> str:
    """Flatten an HTML fragment into plain text.

    Every text node is stripped of surrounding whitespace and appended.
    Block-like tags (br, p, h1-h4, tr, th) contribute a newline, list
    items contribute a newline followed by "- ", and <sup> contributes "^".
    All other tags contribute nothing themselves.
    """
    # Text emitted when a given tag is encountered during the traversal.
    markers = dict.fromkeys(
        ('br', 'p', 'h1', 'h2', 'h3', 'h4', 'tr', 'th'), '\n')
    markers['li'] = '\n- '
    markers['sup'] = '^'

    pieces = []
    for node in BeautifulSoup(html, features="html.parser").descendants:
        if isinstance(node, str):
            # Text nodes are str subclasses in BeautifulSoup.
            pieces.append(node.strip())
        else:
            pieces.append(markers.get(node.name, ''))
    return ''.join(pieces)


with open("tmp.html", "r", encoding='utf-8') as f:
    html_data = f.read()

soup = BeautifulSoup(html_data, "lxml")
# The task data is read as JSON from the <script id="__NEXT_DATA__"> element.
script_tag = soup.find('script', {'id': '__NEXT_DATA__'})
if script_tag is None:
    raise ValueError('tmp.html contains no <script id="__NEXT_DATA__"> element')
script = script_tag.text
data = json.loads(script)

# data = json.dumps(data, indent=4, sort_keys=True)
# with open('tmp.json', 'w') as f:
# json.dump(data, f)

# NOTE(review): the query positions (0, 6, 10) and the snippet position (3)
# are hard-coded; presumably they depend on the current page layout and the
# order of languages -- verify when the scraped data changes.
queries = data['props']['pageProps']['dehydratedState']['queries']

task_num = queries[0]['state']['data']['question']['questionFrontendId']
task_title = queries[0]['state']['data']['question']['title']
task_content = queries[6]['state']['data']['question']['content']
task_code_func = queries[10]['state']['data']['question']['codeSnippets'][3]['code']

# Cut the snippet at the first line that actually starts with ``def``.
# A plain substring search for 'def' would also stop at a "def" that sits
# inside a comment line or inside another word.
def_line = search(r'(?m)^[ \t]*def\b', task_code_func)
if def_line is None:
    raise ValueError('no function definition found in the code snippet')
task_code_func = task_code_func[def_line.start():].strip()

# The function name is the identifier between ``def`` and the opening
# parenthesis; \w also accepts underscores in the name.
name_match = search(r'def\s+(\w+)\s*\(', task_code_func)
if name_match is None:
    raise ValueError('cannot determine the function name of the code snippet')
task_code_func_name = name_match.group(1)


def _example_values(raw_example: str) -> list[str]:
    """Return the Input/Output values of one example with the labels removed."""
    labels = ('Input: ', 'Output: ')
    values = []
    for line in raw_example.split('\n'):
        for label in labels:
            if line.startswith(label):
                # Strip only the leading label: str.replace would also
                # delete the same text if it occurred inside the value.
                values.append(line.removeprefix(label))
                break
    return values


def main_text_split(task_content: str) -> tuple[str, list[list[str]]]:
    """Split the task content into the statement text and the examples.

    Args:
        task_content: HTML of the task description.

    Returns:
        A tuple of two items.
        First item: the description converted to plain text, without the
        <pre> blocks, the ``strong.example`` headings and empty paragraphs.
        Second item: one list per <pre> block holding the values of its
        "Input: " and "Output: " lines, in order of appearance.
    """
    task_content_unescape = html.unescape(task_content)
    # Inline code is shown in curly braces in the plain-text description.
    task_content_unescape = task_content_unescape.replace('<code>', '{')
    task_content_unescape = task_content_unescape.replace('</code>', '}')

    content_soup = BeautifulSoup(task_content_unescape, features="html.parser")

    # Collect the example blocks before they are detached from the tree.
    examples = content_soup.find_all('pre')

    for tag in content_soup.select('strong.example'):
        tag.extract()
    for tag in content_soup.select('pre'):
        tag.extract()
    for tag in content_soup.select('p'):
        if not tag.text.strip():
            tag.extract()

    # Keep the text of emphasised fragments but drop the markup itself.
    for tag in content_soup.select('strong'):
        tag.unwrap()
    for tag in content_soup.select('em'):
        tag.unwrap()

    main_text = parse_html(str(content_soup))

    # Extract the variables of every example.
    examples_list_4_vars = [_example_values(pre.text) for pre in examples]

    return main_text, examples_list_4_vars


main_text, examples_list_4_vars = main_text_split(task_content)

# Arguments for the demo call: the first example that has any parsed value.
# Falls back to an empty argument list, so a task without parsable examples
# still produces a template instead of failing with an IndexError.
first_example_args = next(
    (example[0] for example in examples_list_4_vars if example), '')

template = f'''import unittest
from time import perf_counter

t1 = perf_counter()

"""
{task_num}
{main_text}
"""


{task_code_func}
    return


print({task_code_func_name}({first_example_args}))


# tests
class AllTests(unittest.TestCase):
'''

tests_written = 0
for i, example in enumerate(examples_list_4_vars):
    # A test needs both values of the example: [0] is passed as the call
    # arguments, [1] is the expected result.
    if len(example) < 2:
        continue
    template += f'''
    def test0{i}_{task_code_func_name}(self):
        expected = {example[1]}
        actual = {task_code_func_name}({example[0]})
        self.assertEqual(expected, actual)
    '''
    tests_written += 1

if not tests_written:
    # Keep the generated class syntactically valid when no test was emitted.
    template += '    pass\n'

template += '''
t2 = perf_counter()
print(f'{t2 - t1:.5f} sec')
'''

# Explicit UTF-8, consistent with how tmp.html is written and read: the task
# text may contain characters the platform default encoding cannot encode.
with open(f'task{file_num}_{task_num}.py', 'w', encoding='utf-8') as f:
    f.write(template)