forked from liangxiao1/mini_utils
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml_parser.py
More file actions
97 lines (91 loc) · 3.65 KB
/
html_parser.py
File metadata and controls
97 lines (91 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
'''
github : https://github.com/liangxiao1/mini_utils
This tool extracts link (href) values from an HTML page.
'''
import json
from bs4 import BeautifulSoup
import os
import sys
from urllib import urlopen
import logging
import argparse
import string
import re
# Command-line interface.
# The text is passed as ``description``: the first positional parameter of
# ArgumentParser is ``prog``, so passing it positionally replaced the program
# name in the usage line instead of describing the script.
# --keyword, --must_key and --notkeyword are split on ',' later in the script.
parser = argparse.ArgumentParser(description='Script for get info from html')
parser.add_argument('--url', dest='url', action='store',
                    help='specify url', default=None, required=True)
parser.add_argument('--keyword', dest='keyword', action='store',
                    help='specify keyword you are looking for',
                    default=None, required=True)
parser.add_argument('--must_key', dest='must_key', action='store',
                    help='must have keys', default=None, required=False)
parser.add_argument('--notkeyword', dest='notkeyword', action='store',
                    help='specify keyword you are not looking for',
                    default=None, required=False)
# --tag and --name are concatenated to form the key written to the output files.
parser.add_argument('--tag', dest='tag', action='store',
                    help='optional specify prefix tag, default is JOB_',
                    default='JOB_', required=False)
parser.add_argument('--name', dest='name', action='store',
                    help='optional specify suffix name, default is PKGURL',
                    default='PKGURL', required=False)
parser.add_argument('--dir', dest='file_dir', action='store', default='/tmp',
                    help='optional, output location, default is /tmp',
                    required=False)
args = parser.parse_args()
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s")

# Unpack the parsed options into the module-level names used below.
url = args.url
keywords = args.keyword
must_keys = args.must_key
notkeywords = args.notkeyword
tag = args.tag
name = args.name

# Download the page. The handle is closed explicitly (try/finally rather than
# ``with``, because the Python 2 urllib handle is not a context manager) so
# the connection is released even if read() raises.
url_fh = urlopen(url)
try:
    html_src = url_fh.read()
finally:
    url_fh.close()

# Keep a raw copy of the downloaded page in the current working directory.
# The body is written with a single write() in binary mode; writelines() on a
# string iterates it one character at a time.
with open('arch_taskinfo.txt', 'wb') as fh:
    fh.write(html_src)

soup = BeautifulSoup(html_src, 'lxml')
log.info("loading done")

# Output files live in the directory given by --dir (default /tmp).
JOB_DIR = args.file_dir
JOB_ENV_YAML = os.path.join(JOB_DIR, "job_env.yaml")
JOB_ENV_TXT = os.path.join(JOB_DIR, "job_env.txt")

# Matching hrefs are collected here by the filtering code further down.
results = []
#s=soup.findAll('a',href=True)
#log.info(s)
#s=soup.findAll(string=re.compile("Package Name"),href=False)
#log.info(s)
#def walk_soup(soup):
# if hasattr(soup,'children'):
# for child in soup.children:
# log.info(" %s",child.string)
# #log.info(" %s",child)
# if hasattr(child,'a'):
# log.info("xiliang %s",child.a)
# walk_soup(child)
# else:
# log.info("xiliang2 %s",soup)
#walk_soup(soup)
#s=soup.findAll('tr',href=False)
#for i in s:
# log.info(i.get_text())
# for child in i.children:
# log.info(" %s",child.string)
def _split_patterns(raw):
    """Split a comma-separated option value into a list of regex patterns.

    ``None`` or an empty string (option not given) yields an empty list.
    Empty items, e.g. from a trailing comma, are dropped because an empty
    pattern matches every href.
    """
    if not raw:
        return []
    return [item for item in raw.split(',') if item]


def _select_links(hrefs, keywords, must_keys=None, notkeywords=None):
    """Return the hrefs that pass the keyword filters, in input order.

    Each of ``keywords``, ``must_keys`` and ``notkeywords`` is a
    comma-separated string of regular expressions (or ``None``).  An href is
    selected when:

    * it matches none of ``notkeywords``,
    * it matches every one of ``must_keys``, and
    * it matches at least one of ``keywords``.

    ``must_keys`` is honoured whether or not ``notkeywords`` is given, and an
    href that matches several keywords is returned once for that link rather
    than once per matching keyword.  An href that occurs several times in
    ``hrefs`` is still returned once per occurrence.
    """
    wanted = _split_patterns(keywords)
    required = _split_patterns(must_keys)
    rejected = _split_patterns(notkeywords)

    def hits(pattern, href):
        # re.search finds the pattern anywhere in the href.  Wrapping the
        # pattern in '.*' and using re.match breaks on patterns containing
        # '|' and never looks past the first line of the href.
        return re.search(pattern, href) is not None

    selected = []
    for href in hrefs:
        if any(hits(pattern, href) for pattern in rejected):
            continue
        if not all(hits(pattern, href) for pattern in required):
            continue
        if any(hits(pattern, href) for pattern in wanted):
            selected.append(href)
    return selected


# Filter every <a href=...> in the parsed page and collect the matches.
for href in _select_links([a['href'] for a in soup.findAll('a', href=True)],
                          keywords, must_keys, notkeywords):
    log.info("found %s", href)
    results.append(href)
# Append one line holding the comma-joined results to each output file:
# "<tag><name>: <value>" in the YAML file, "<tag><name>=<value>" in the text file.
joined_results = ','.join(results)
for out_path, line_format in ((JOB_ENV_YAML, "%s%s: %s\n"),
                              (JOB_ENV_TXT, "%s%s=%s\n")):
    with open(out_path, 'a') as out_fh:
        out_fh.write(line_format % (tag, name, joined_results))
        log.info("Write to %s", out_path)