昨天投简历的时候,有个HR给了我个网站,http://www.dg.gov.cn/machong/zfcg/list.shtml
大致意思我也没懂,聊天也不回我,我只能按自己的理解尝试去抓点数据,看看有没有戏,找实习找了半个多月,毫无进展。
下面是这个网站的代码:
from lxml import etree
import requests
import pymysql
# HTTP request headers: spoof a mobile Chrome user agent so the server
# serves the normal page instead of rejecting the default python-requests UA.
headers = {
"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
"AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36",
}
# Site root; prepended to the relative hrefs found on listing pages.
base_url = 'http://www.dg.gov.cn'
# Connect to the local MySQL database.
# NOTE(review): credentials are hard-coded — move them to config/env vars.
# Per the author's note at the bottom of this post, charset='utf8' is what
# fixed the "ordinal not in range(256)" insert error for Chinese text.
db = pymysql.connect(host = "localhost" , user = "root" , password = "123",
database="kaijianboss",port=3306,charset='utf8')
# Cursor shared by save_to_sql() below (module-level, not thread-safe).
cursor = db.cursor()
# Parse a listing page: extract every detail-page link and crawl it.
def parse_list(url):
    """Fetch one listing page and crawl each procurement notice it links to.

    url: absolute URL of a listing page ("list.shtml" / "list_N.shtml").
    Side effects: issues HTTP requests and, via parse_detail(), DB inserts.
    """
    # BUG FIX: headers must be passed by keyword. The second positional
    # argument of requests.get() is `params`, so the original sent the
    # UA dict as a query string and made the request with no headers.
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)
    # Hrefs on the listing page are site-relative (no scheme/host).
    hrefs = tree.xpath("//div[contains(@class,'list-right_title')]/a/@href")
    for href in hrefs:
        # Prepend the domain to build the absolute detail-page URL,
        # then scrape it. (Renamed loop var: the original shadowed `url`.)
        parse_detail(base_url + href)
# Parse a detail page: collect the notice's fields and persist them.
def parse_detail(url):
    """Scrape one procurement-notice page and store it via save_to_sql().

    url: absolute URL of a detail page.
    Side effects: one HTTP GET plus one MySQL insert.
    """
    info = {}
    # BUG FIX: pass headers by keyword (positionally the dict becomes `params`).
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)
    # Project title. BUG FIX: xpath() returns a list; join it so a scalar
    # string (not a Python list) is written to MySQL.
    title = "".join(tree.xpath("//div[contains(@class,'title')]//ucaptitle/text()"))
    # Full announcement body text.
    contents = "".join(tree.xpath("//div[contains(@class,'con_cen')]//p//text()"))
    # Publish time.
    time = "".join(tree.xpath("//publishtime/text()")).strip()
    # Notice category / bidding status. BUG FIX: the original compared an
    # lxml Element object to the string '结果公布', which is never equal,
    # so every row was stored as "投标中". Compare the element's text.
    cate_nodes = tree.xpath("//div[contains(@class,'con_cen')]//p")
    cate_text = "".join(cate_nodes[0].xpath(".//text()")) if cate_nodes else ""
    if '结果公布' in cate_text:
        bidding = "已结束"
    else:
        bidding = "投标中"
    # Issuing unit is fixed for this section of the site.
    company = "东莞市麻涌镇招投标服务所"
    # Attachment href is relative to the page's directory; rebuild that
    # directory from the first six URL segments (scheme, empty, host,
    # then three path components), as in the original index-by-index join.
    f_accessory = "".join(tree.xpath("//a[@type='file']/@href"))
    s_url = "/".join(url.split("/")[:6])
    if f_accessory:
        accessory = s_url + "/" + f_accessory
    else:
        accessory = "无附件"
    info['title'] = title
    info['contents'] = contents
    info['time'] = time
    info['bidding'] = bidding
    info['url'] = url
    info['company'] = company
    info['accessory'] = accessory
    save_to_sql(info)
#构造url,传递给列表页方法
def main():
for x in range(2,73):
url = "http://www.dg.gov.cn/machong/zfcg/list_{}.shtml".format(x)
parse_list(url)
# Persist one scraped notice into MySQL.
def save_to_sql(info):
    """Insert one notice dict into table `dgmy`.

    info: dict with keys title/contents/time/bidding/url/company/accessory.
    Uses the module-level connection `db` and cursor `cursor`.
    """
    # Parameterized statement: values are bound by the driver, which keeps
    # scraped text (quotes, etc.) from breaking or injecting into the SQL.
    sql = """
    insert into dgmy(id, title, contents, time, bidding, url, company, accessoryurl)
    values(null, %s,%s,%s,%s,%s,%s,%s)
    """
    row = (
        info['title'],
        info['contents'],
        info['time'],
        info['bidding'],
        info['url'],
        info['company'],
        info['accessory'],
    )
    try:
        cursor.execute(sql, row)
        db.commit()
        print("sql存储成功")
    except Exception as e:
        # BUG FIX: roll back the failed transaction so one bad row does not
        # leave the shared connection in a broken state for later inserts.
        db.rollback()
        print("Error:", e.args)
# Script entry point: crawl every listing page and persist each notice.
if __name__ == '__main__':
    main()
简单说一下思路:其实这个网站还挺好爬的,可以用框架,也可以不用。这里没用框架,是觉得它的链接很好理解、很好构造,用很简单的循环就可以完成。
这里比较困难的地方是获取附件的链接,需要多注意一下下载链接的来源逻辑。
网站1000+数据全都拿到了。
数据库:
还有第二页。不过多展示。
顺便记录一下报错:向数据库插入数据时,报错如下:
UnicodeEncodeError: 'latin-1' codec can't encode characters in position 105-130: ordinal not in range(256)
不知道为啥,解决办法是:连接数据库时加入该参数:
charset='utf8'
估计这个还是凉了,两天了都不带理人的,哎,难搞哦。