程序运行截图:
MySQL 代码:
-- Table `htgs`: presumably one row per scraped contract-publication notice
-- (TODO confirm against the insert code, which is not visible here).
-- NOTE(review): the column meanings below are inferred from the pinyin-style
-- abbreviations in the names and should be verified.
CREATE TABLE htgs (
    id           INT PRIMARY KEY,  -- row identifier supplied by the writer (no auto-increment)
    cgdw         VARCHAR(2000),    -- presumably the purchasing entity; TODO confirm
    cgmc         VARCHAR(2000),    -- presumably the procurement/project name; TODO confirm
    zbdw         VARCHAR(2000),    -- presumably the winning bidder; TODO confirm
    htid         VARCHAR(2000),    -- presumably the contract number; TODO confirm
    htvalue      VARCHAR(2000),    -- presumably the contract amount, kept as text; TODO confirm
    zbgyskhbank  VARCHAR(2000),    -- presumably the winning supplier's bank; TODO confirm
    zbgyskhzh    VARCHAR(2000),    -- presumably the winning supplier's account number; TODO confirm
    hturl        VARCHAR(2000),    -- presumably the URL of the contract page; TODO confirm
    fbtime       VARCHAR(200)      -- presumably the publication time, kept as text; TODO confirm
);
Python 代码:
# 2019/7/5
import json
import random
import re
from urllib import parse
import requests
import pymysql
# Keyword arguments for the MySQL connection, gathered in one place.
_db_settings = dict(
    host='localhost',
    port=8080,
    user='root',
    passwd='123',
    db='students',
    charset='utf8',
)
# Open the database connection.
db = pymysql.connect(**_db_settings)
# Create the cursor object from the open connection via cursor().
cursor = db.cursor()
# Module-level flag: whether crawling should stop (used for incremental crawling).
"""是否结束爬取(继增用)"""
over = False
# 通过地址获取数据
def get_data(url):
"""通过html获取页面内容"""
global over
URL = "http://www.ccgp-jiangxi.gov.cn/web/jyxx/002006/002006006/%i.html" % url
try:
respose = requests.get(URL)
except requests.exceptions.ConnectionError:
respose = requests.get(URL)
print("*" * 300)
print("开始爬取第%i页的政府采购合同公示数据!" % url)
# 获取合同公示内容
contents = re.findall(
r'<li class="ewb-list-node clearfix">.*?<a href="(.*?)" target="_blank" class="ewb-list-name">',
respose.text, re.S)
times = re.findall(
r'<span class="ewb-list-date">(.*?)</span>',
respose.text, re.S)
# print("\033[34m合同公示内容:%s" % str(contents))
print("合同公示数量:%s" % len(contents) + "条")
print("发布时间:%s" % str(times[0]))
print("*" * 300)
for temp in range(len(contents)):
# """过滤网页标签"""
# dr = re.compile(r'<[^>]+>', re.S)
# print(str(dr.sub('', time[0])