customPythonParser/BillParserTake2.py at master · deeptiboddapati/customPythonParser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import bs4, re


class html_cleanup():
  """Step-by-step cleanup of a bill's HTML page held as a BeautifulSoup tree.

  Every method mutates the instance in place.  The module-level ``htmltext``
  function shows the order in which the steps are meant to be applied.
  """

  # Rules applied by add_tags, in priority order:
  #   (pattern tested against the element's first non-blank string,
  #    value stored in the ``class`` attribute, new tag name).
  # Raw strings keep the regex escapes (\A, \., \(, \s) away from Python's own
  # string-escape handling, which warns about them on recent interpreters.
  _TAG_RULES = (
    (re.compile(r"\A(SECTION)"), 'SECTION', 'p'),
    (re.compile(r"\A(Sec\.)"), 'Sec', 'p'),
    (re.compile(r"\A(SUBCHAPTER)"), 'Subchapter', 'p'),
    # One parenthesised character such as "(a)" that is NOT immediately
    # followed by a second one such as "(1)".
    (re.compile(r"\A(\(.\))(?!\s?\(.\))"), 'list', 'li'),
  )

  def __init__(self):
    # Parsed document (see set_text); narrowed to one table by remove_title.
    self.text = None
    # Text of the page's <title>; filled in by remove_title.
    self.title = None
    # Whitespace-normalised markup string; filled in by remove_allspaces.
    self.output = None

  def set_text(self, text):
    """Parse the HTML string ``text`` with the lxml parser into ``self.text``."""
    self.text = bs4.BeautifulSoup(text, 'lxml')

  def remove_empty_elements(self):
    """Delete every <tr>, <td> and <u> element that carries no visible text.

    An element is considered empty when its text is the empty string or
    consists solely of whitespace.
    """
    for tag in ('tr', 'td', 'u'):
      for element in self.text(tag):
        # Skip elements that are no longer attached to the tree, e.g. ones
        # destroyed together with an empty ancestor earlier in this pass.
        if getattr(element, 'parent', None) is None:
          continue
        # ``''.isspace()`` is False, so compare the stripped text instead;
        # this also catches elements with no text at all.
        if not element.get_text().strip():
          element.decompose()

  def remove_title(self):
    """Record the page title and narrow ``self.text`` to the bill's table.

    The <title> element, when present, is pulled out of the tree and its text
    is stored in ``self.title``; without a <title>, ``self.title`` is None.
    ``self.text`` is then replaced by the last <table> of the document.

    Raises:
      IndexError: if the document contains no <table>.
    """
    title_element = self.text.title
    if title_element is None:
      self.title = None
    else:
      self.title = title_element.extract().get_text()

    # The table holding the bill is taken to be the last one on the page.
    self.text = self.text('table')[-1].extract()

  def remove_space(self, tag):
    """Remove the bare text nodes that sit beside the first ``tag`` element.

    Only direct siblings of the first match are examined.  Any node whose
    type is exactly ``NavigableString`` is removed, whatever its content;
    subclasses (such as comments) are left alone.  Does nothing when
    ``self.text`` contains no ``tag`` element.
    """
    matches = self.text(tag)
    if not matches:
      return
    first = matches[0]

    # Snapshot both directions before editing: extracting a node rewires the
    # sibling links the generators walk over.
    siblings = list(first.next_siblings) + list(first.previous_siblings)
    for node in siblings:
      # Exact type test on purpose -- isinstance() would also hit subclasses.
      if type(node) is bs4.element.NavigableString:
        node.extract()

  def consolidate_tag_elements(self, rem, tag):
    """Get rid of ``tag`` elements while keeping what they contain.

    Args:
      rem: 'parent' replaces each element by its own contents;
        'sibling different name' moves each element's contents into the
        element's previous sibling.  Any other value is a silent no-op.
      tag: name of the elements to process.
    """
    if rem == 'parent':
      self._unwrap_into_parent(tag)
    elif rem == 'sibling different name':
      self._merge_into_previous_sibling(tag)

  def _unwrap_into_parent(self, tag):
    # Replace every ``tag`` element by its children.
    for element in self.text(tag):
      element.unwrap()

  def _merge_into_previous_sibling(self, tag):
    # Move the children of every ``tag`` element to the end of the node that
    # precedes it; an element with no usable predecessor becomes a <p>.
    for element in self.text(tag):
      previous = element.previous_sibling
      if previous:
        element.extract()
        # Copy the child list: append() re-parents each child, which
        # shrinks ``element.contents`` while we iterate.
        for child in list(element.contents):
          previous.append(child)
      else:
        element.name = 'p'

  def add_tags(self, tag):
    """Rename and classify ``tag`` elements according to how their text starts.

    The first non-blank string of each element is tested against the rules
    in ``_TAG_RULES``; the first rule that matches decides the outcome:

      starts with "SECTION"     -> <p class="SECTION">
      starts with "Sec."        -> <p class="Sec">
      starts with "SUBCHAPTER"  -> <p class="Subchapter">
      starts with "(x)" not followed by another "(y)" -> <li class="list">

    Elements that contain no text, or match no rule, are left untouched.
    """
    for element in self.text(tag):
      first_string = next(iter(element.stripped_strings), None)
      if first_string is None:
        # Nothing to classify; indexing an empty list here used to raise.
        continue
      for pattern, css_class, new_name in self._TAG_RULES:
        if pattern.search(first_string):
          element.name = new_name
          element['class'] = css_class
          break

  def remove_allspaces(self):
    """Serialise ``self.text`` into ``self.output`` with whitespace collapsed.

    Every run of whitespace becomes a single space, and leading/trailing
    whitespace is dropped.
    """
    self.output = ' '.join(str(self.text).split())

def htmltext(text):
  """Run the complete cleanup pipeline over an HTML page.

  ``text`` is the raw HTML markup.  The return value is whatever the
  ``html_cleanup`` instance leaves in its ``output`` attribute after the
  final step (the processed markup joined with single spaces).
  """
  cleaner = html_cleanup()

  # 1. Parse the markup.
  cleaner.set_text(text)
  # 2. Throw away blank <tr>/<td>/<u> elements.
  cleaner.remove_empty_elements()
  # 3. Dissolve every <td> so its contents sit directly in the row.
  cleaner.consolidate_tag_elements('parent', 'td')
  # 4. Keep the page title aside and narrow the tree to the bill's table.
  cleaner.remove_title()
  # 5. Drop the loose text nodes next to the first row.
  cleaner.remove_space('tr')
  # 6. Turn recognised rows into classified <p>/<li> elements.
  cleaner.add_tags('tr')
  # 7. Fold each remaining <tr> into the element that precedes it.
  cleaner.consolidate_tag_elements('sibling different name', 'tr')
  # 8. Serialise with whitespace collapsed.
  cleaner.remove_allspaces()

  return cleaner.output