import urllib.request
from bs4 import BeautifulSoup, SoupStrainer
import re
# Accumulates the tag-stripped text fragments collected by for_list().
lis_str = []

# Compiled once at module level instead of once per leaf node.
_TAG_PATTERN = re.compile("<(.*?)>")


def for_list(list_x):
    """Recursively walk a tree of parse nodes, collecting tag-stripped text.

    Non-string containers with more than one child are recursed into; every
    other node is converted to text, has its markup tags removed, and is
    appended to the module-level ``lis_str`` list.
    """
    for x in list_x:
        # Fix: strings are iterable too — the original recursed into any
        # string longer than one character, splitting it into single chars.
        # NavigableString subclasses str, so this guard also covers bs4 text.
        if not isinstance(x, str) and len(x) > 1:
            for_list(x)
        else:
            # 过滤所有标签 — strip all markup tags from the leaf's text.
            lis_str.append(_TAG_PATTERN.sub("", str(x)))
if __name__ == '__main__':
    # TODO: fill in the target page URL before running (placeholder kept).
    url = ''.strip()
    # Parse only the detail-content <div> to avoid building the full tree.
    only_content_div = SoupStrainer('div', attrs={"class": "vF_detail_content_container"})
    # Fix: close the HTTP response (original leaked it; also 'reponse' typo).
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response.read().decode('utf-8'), 'html.parser',
                             parse_only=only_content_div)
    for_list(soup)

    # Fix: 'datas' was appended to below without ever being defined (NameError).
    datas = []
    # Keyword whitelist: procurement / contact / winning-bid / phone / fax / deal.
    filter_lis = ['采购', '联系', '中标', '电话', '传真', '成交']
    # Compile the CJK matcher once instead of once per line.
    cjk_pattern = re.compile(u'[\u4e00-\u9fa5]')
    for line in lis_str:
        # Keep lines that contain Chinese text or digits...
        if cjk_pattern.search(line) or re.search(r'\d', line):
            # ...and at least one keyword (fix: original ignored filter_lis
            # and hand-rolled a six-way 'or' chain over the same strings).
            if any(keyword in line for keyword in filter_lis):
                datas.append(line)
# 爬虫 (crawler) — blog-footer residue from the scraped article, kept as a comment
# Latest recommended article published 2024-07-29 15:35:41