**功能介绍**
获取政府招标内容,若包含以下关键词,就提取该标书内容保存(本地文本):1、汽车采购 2、汽车租赁 3、公务车 4、公务车租赁 5、汽车协议供货 6、汽车 7、租赁
爬取网站
http://www.lxggzyjy.com/f/newtrade/annogoods/list?selectedProjectType=2
作者:
speed_zombie
版本信息:
python v3.7.4
运行:
python web_purchase.py
解析结果:
解析完成后会生成"采购招标.txt"文件。本脚本主要运用BeautifulSoup来解析网页的数据,BeautifulSoup是一款非常好用的解析插件。
废话不多说,上代码:
# coding=UTF-8
import requests
from bs4 import BeautifulSoup
# Base host of the procurement site; relative detail-page links are joined onto it.
host = "http://www.lxggzyjy.com"
url = 'http://www.lxggzyjy.com/f/newtrade/annogoods/getAnnoList' # government procurement listing endpoint
# Keywords to look for in each tender page (car purchase / rental / official vehicles ...).
keys=["汽车采购","汽车租赁","公务车","公务车租赁","汽车协议供货","汽车","租赁"]
dir_root = "ceshi" # root directory intended for saved files — NOTE(review): appears unused in this chunk
# POST payload for the listing endpoint.
datas = {
    "pageNo": 0,            # page index; 0 with a huge pageSize fetches everything in one request
    "pageSize": 11120,      # oversized page size so no pagination loop is needed
    "tradeStatus": 0,
    "prjpropertyid": "21,22,23,24",   # project-property filter — presumably procurement categories; verify against the site
    # Optional region filters (tradeArea), kept for reference:
    # "tradeArea": 3025,
    # "tradeArea": 3026,
    # "tradeArea": 3027,
    # "tradeArea": 3028,
    # "tradeArea": 3029,
    # "tradeArea": 3030,
    # "tradeArea": 3031,
    # "tradeArea": 3032,
    "projectname": "",
    "tabType":""
}
#获取url地址上的所有url
# Collect all detail-page links from the listing endpoint.
def getWebUrl(_url, _data):
    """POST the search payload to the listing endpoint and return every
    anchor href that points at a detail page (hrefs containing "/f/").

    Fixes vs. original: removed the dead ``try/except AttributeError``
    (``Tag.get`` returns None rather than raising on that path), dropped
    the unused ``sum_news`` alias, and replaced the manual append loop
    with a comprehension.

    :param _url: listing endpoint URL to POST to
    :param _data: form payload (page number/size and filters)
    :return: list of href strings, possibly containing duplicates
    """
    print("解析网址中...")
    res = requests.post(_url, data=_data)
    soup = BeautifulSoup(res.text, "lxml")
    # Keep only anchors that actually carry an href and link under /f/.
    return [
        a.get('href')
        for a in soup.find_all('a')
        if a.get('href') and "/f/" in a.get('href')
    ]
#数组去重方法
# De-duplicate a list while preserving first-seen order.
def noReaptArr(_score_arr):
    """Return a new list containing the unique elements of ``_score_arr``
    in their original (first occurrence) order.

    ``dict.fromkeys`` preserves insertion order (Python 3.7+), turning the
    original O(n^2) ``element not in new_arr`` scan into a single O(n)
    pass. Elements must be hashable — true for the URL strings this
    script feeds in.

    :param _score_arr: iterable of hashable elements
    :return: list of unique elements, first-seen order preserved
    """
    return list(dict.fromkeys(_score_arr))
#查找每个政府采购网页的关键字
# Scan each procurement detail page for the target keywords.
def findKeyWord(_url_arr):
    """Download each detail page (host + relative URL) and build a report
    line for every page that contains one of the module-level ``keys``.

    Fixes vs. original: pages without an ``<h2>`` heading raised
    ``IndexError`` on ``news_a[0]``, and an ``<h2>`` with nested markup
    makes ``.string`` return ``None``, which then crashed the string
    concatenation — both are now guarded. Also removed the stray
    semicolon after ``break``.

    :param _url_arr: list of relative detail-page URLs
    :return: newline-joined report string, or "暂无数据" when no page matched
    """
    all_pro = ""
    for path in _url_arr:
        res = requests.get(host + path)
        soup = BeautifulSoup(res.text, "lxml")
        headings = soup.find_all('h2')
        # Guard against missing/empty <h2>: fall back to a placeholder title.
        str_pro = headings[0].string if headings else None
        if str_pro is None:
            str_pro = "未知项目"
        print("分析项目《" + str_pro + "》中")
        for key in keys:
            if key in res.text:
                print("发现目标==============================" + key)
                all_pro += "项目名称:<<" + str_pro + ">>----->出现了" + key + "\n"
                break  # record only the first matching keyword per page
    if len(all_pro) > 0:
        return all_pro
    else:
        return "暂无数据"
#创建文件
# Write the report text to a file.
def mkfile(_filename, _code):
    """Write ``_code`` to ``_filename`` as UTF-8, overwriting any existing
    file.

    Bug fix: the original called ``f.close`` without parentheses, so the
    handle was never explicitly closed or flushed. A ``with`` block
    guarantees both, even if ``write`` raises.

    :param _filename: path of the file to create/overwrite
    :param _code: text content to write
    """
    with open(_filename, "w", encoding='utf-8') as f:
        f.write(_code)
#
all_url = getWebUrl(url,datas)
no_reapt_url_arr = noReaptArr(all_url)
print("\n共有"+str(len(no_reapt_url_arr))+"个项目需要分析\n其中有重复采购招标地址"+str(len(all_url)-len(no_reapt_url_arr))+"已经忽略\n")
projects = findKeyWord(no_reapt_url_arr)
mkfile("采购招标.txt",projects)
print("解析完成")