# 提取200个网页 (Extract data from 200 web pages)
import re
from bs4 import BeautifulSoup
import os
import pandas as pd
# Folder of saved HTML pages. Every directory entry is opened as a page, so a
# sub-directory inside this folder would make open() fail.
root = r'C:\Users\hp\Desktop\20190923HTML'
dirs = os.listdir(root)

# --- Tenderer extraction ----------------------------------------------------
# Pattern shape: <label> [colon] [newline] <one char that is not "为">
# <shortest run of text> <organisation suffix>.
# Labels are tried in the listed order; "招标人" may be broken across lines.
tenderer_labels = (
    r'招\n?标\n?人', '招标单位', '单位', '采购人', '采购方',
    '项目单位', '建设单位', '比选人', '采购单位',
)
# Organisation-name endings that terminate a match. No entry is a prefix of
# another, so the order is irrelevant; entries that were listed twice in the
# old pattern appear once here.
tenderer_suffixes = (
    '办事处', '有限公司', '事业部', '管理局', '资源局', '商城', '体育馆',
    '小学', '管委会', '大队', '学院', '大学', '学校', '执法局', '医院',
    '电视台', '支队', '政府', '办公室', '建设局', '管理所', '小组', '中学',
    '运输局', '技术馆', '图书馆', '检查站', '基地', '税务局', '兽医局',
    '监理总站', '宣传部', '林业局', '联合会', '质监局', '事务局', '消防总队',
    '信息化局', '公证处', '财政厅', '养护站', '研究室', '检察院', '幼儿园',
    '民政局', '分局', '教育组', '基建处', '管理站', '管理处', '总站',
)
# The colon after the label may be ASCII ":" or the full-width U+FF1A used in
# Chinese text; it is written as an escape so the source stays unambiguous.
tenderer_re = re.compile(
    '(?:' + '|'.join(tenderer_labels) + ')'
    + r'[:\uff1a]?\n?[^为].+?'
    + '(?:' + '|'.join(tenderer_suffixes) + ')'
)

# One entry per item of `dirs`, in the same order, so that the per-field
# lists stay aligned row by row.
tenderer_all = []
for file in dirs:
    path = os.path.join(root, file)
    # NOTE(review): no encoding is given, so the platform default is used;
    # confirm it matches how the pages were saved.
    with open(path, 'r') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'lxml')
    result = soup.get_text()

    # search() stops at the first (leftmost) hit, which is the same match
    # findall(...)[0] would return, without scanning the rest of the page.
    found = tenderer_re.search(result)
    if found is None:
        # Placeholder string kept as-is: it is counted below and exported.
        tenderer = 'None'
    else:
        # Drop the label (text before the last colon of either kind), keep
        # only the last line, then the last whitespace-separated token.
        after_colon = re.split(r'[:\uff1a]', found.group(0))[-1]
        tenderer = after_colon.split('\n')[-1].split()[-1]
    tenderer_all.append(tenderer)

# Notebook-style inspection expressions: how many entries contain 'None', and
# the list itself. They have no effect when the file runs as a plain script.
sum(item.count('None') for item in tenderer_all)
tenderer_all
# --- Agency extraction --------------------------------------------------------
# Pattern shape: <label> [colon] [newline] <one char that is not "为">
# <shortest run of text> <company/office suffix>.
# Labels are tried in the listed order.
daili_labels = (
    '招标机构', '代理机构', '磋商代理机构', '集中采购机构', '招标代理机构',
    '采购代理机构', '集采机构', '招标代理', '招标代理单位', '招标代理人',
)
daili_suffixes = ('有限公司', '有限责任公司', '商城', '办公室')
# The colon after the label may be ASCII ":" or the full-width U+FF1A used in
# Chinese text; it is written as an escape so the source stays unambiguous.
daili_re = re.compile(
    '(?:' + '|'.join(daili_labels) + ')'
    + r'[:\uff1a]?\n?[^为].+?'
    + '(?:' + '|'.join(daili_suffixes) + ')'
)

# One entry per item of `dirs`, in the same order, so that the per-field
# lists stay aligned row by row.
daili_all = []
for file in dirs:
    path = os.path.join(root, file)
    # NOTE(review): no encoding is given, so the platform default is used;
    # confirm it matches how the pages were saved.
    with open(path, 'r') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'lxml')
    result = soup.get_text()

    # search() stops at the first (leftmost) hit, which is the same match
    # findall(...)[0] would return, without scanning the rest of the page.
    found = daili_re.search(result)
    if found is None:
        # Placeholder string kept as-is: it is counted below and exported.
        daili = 'None'
    else:
        # Drop the label (text before the last colon of either kind), keep
        # only the last line, then the last whitespace-separated token.
        after_colon = re.split(r'[:\uff1a]', found.group(0))[-1]
        daili = after_colon.split('\n')[-1].split()[-1]
    daili_all.append(daili)

# Notebook-style inspection expressions: how many entries contain 'None', and
# the list itself. They have no effect when the file runs as a plain script.
sum(item.count('None') for item in daili_all)
daili_all
# --- Contact-person extraction ------------------------------------------------
# Pattern shape: the label "联系人" (possibly broken across lines), an optional
# colon, an optional newline, one char that is not "。", the shortest run of
# text on that line, and finally a run of CJK ideographs (U+4E00..U+9FA5).
# The colon may be ASCII ":" or the full-width U+FF1A used in Chinese text.
lianxi_re = re.compile(
    r'(?:联\n?系\n?人)[:\uff1a]?\n?[^。].+?(?:[\u4e00-\u9fa5]+)'
)

# One entry per item of `dirs`, in the same order, so that the per-field
# lists stay aligned row by row.
lianxi_all = []
for file in dirs:
    path = os.path.join(root, file)
    # NOTE(review): no encoding is given, so the platform default is used;
    # confirm it matches how the pages were saved.
    with open(path, 'r') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'lxml')
    result = soup.get_text()

    # search() stops at the first (leftmost) hit, which is the same match
    # findall(...)[0] would return, without scanning the rest of the page.
    found = lianxi_re.search(result)
    if found is None:
        # Placeholder string kept as-is: it is counted below and exported.
        lianxi = 'None'
    else:
        # Drop the label (text before the last colon of either kind), keep
        # only the last line, then the last whitespace-separated token.
        after_colon = re.split(r'[:\uff1a]', found.group(0))[-1]
        lianxi = after_colon.split('\n')[-1].split()[-1]
    lianxi_all.append(lianxi)

# Notebook-style inspection expressions: how many entries contain 'None', and
# the list itself. They have no effect when the file runs as a plain script.
sum(item.count('None') for item in lianxi_all)
lianxi_all
# --- Phone-number extraction --------------------------------------------------
# Pattern shape: a label, an optional colon, then the number itself, which is
# the only capture group. Accepted number forms, tried in this order:
#   11 digits; 0ddd-dddddddd; 0ddd-ddddddd; 0dd-dddddddd;
#   0ddd—ddddddd (em dash); 8 digits; 7 digits.
# Written as a raw string so that "\d" is a regex escape rather than an
# invalid string escape. The colon may be ASCII ":" or full-width U+FF1A.
phone_re = re.compile(
    r'(?:联\n?系\n?电\n?话|联系人|电话|话)[:\uff1a]?'
    r'(\d{11}|0\d{3}-\d{8}|0\d{3}-\d{7}|0\d{2}-\d{8}|0\d{3}—\d{7}|\d{8}|\d{7})'
)

# One entry per item of `dirs`, in the same order, so that the per-field
# lists stay aligned row by row.
phone_all = []
for file in dirs:
    path = os.path.join(root, file)
    # NOTE(review): no encoding is given, so the platform default is used;
    # confirm it matches how the pages were saved.
    with open(path, 'r') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'lxml')
    result = soup.get_text()

    # search() stops at the first (leftmost) hit; group(1) is what
    # findall(...)[0] returns for a pattern with a single capture group.
    found = phone_re.search(result)
    # The captured text holds only digits and a dash, so no colon/newline/
    # whitespace trimming is needed. 'None' is the placeholder for "not found".
    phone = found.group(1) if found is not None else 'None'
    phone_all.append(phone)

# Notebook-style inspection expressions: how many entries contain 'None', and
# the list itself. They have no effect when the file runs as a plain script.
sum(item.count('None') for item in phone_all)
phone_all
# Assemble the four per-page result lists into one table; the lists must all
# have the same length (one entry per scanned page) for this to succeed.
dataframe = pd.DataFrame({
    '招标人': tenderer_all,
    '代理机构': daili_all,
    '联系人': lianxi_all,
    '电话': phone_all,
})
# Export as CSV in a Chinese legacy encoding. GB18030 is used instead of
# GB2312 because Python's gb2312 codec cannot encode every character the
# extraction can produce (for example the em dash accepted inside phone
# numbers, or less common hanzi in names) and would abort the export with
# UnicodeEncodeError. GB18030 is the backward-compatible superset that covers
# all of Unicode.
dataframe.to_csv(r"C:\Users\hp\Desktop\data.csv", encoding='gb18030', sep=',')
# 优化代理机构 (Refine the agency extraction)
import re
from bs4 import BeautifulSoup
import os
import pandas as pd
# Folder whose entries are processed by this section; note that this rebinds
# `root` and `dirs` to a different location than the earlier section used.
root = r'C:\Users\Administrator\Desktop\100G网页'
dirs=os.listdir(root)
# Result accumulator for this section — presumably filled by the loop that
# follows (its body is not fully visible here; confirm).
proxy_name_all = []
for file in dirs:
path = os.path.join(root, file)
with open(path,'r',encoding='utf_8_sig') as f:
content = f.read()
soup = BeautifulSoup(content,'lxml')
result = soup.get_text()
text = re.sub('\s| |\xa0| |\\r|\n|\\n|\r|\t|\\t','',result)
text=re.sub(