# Python爬虫Demo (Python crawler demo)
import json
import os
import pymysql
import requests
from lxml import etree
import time
import csv
# 定义函数抓取
def crow_first(a):
    """Fetch one lawyer's detail record by id, insert it into MySQL, and save the photo.

    :param a: lawyer id (string or int); a falsy value becomes '' in the URL
    """
    # Build the per-lawyer detail URL.
    url = 'https://*******/portal_server/lawyer/query/' + str(a or '')
    # Timeout so a stalled server cannot hang the whole crawl.
    r = requests.get(url, timeout=10)
    # Force UTF-8 so Chinese text does not come back garbled.
    r.encoding = 'utf-8'
    # r.json() already returns a Python dict; the original
    # json.dumps()/json.loads() round-trip was redundant.
    user_dic = r.json()
    a = str(a or '')
    data = user_dic['data']
    name_str = str(data['name'] or '')
    license_number = str(data['licenseNumber'] or '')
    description = str(data['description'] or '')
    phone = str(data['phone'] or '')
    # assumes data['photo'] is always a dict with a 'name' key — TODO confirm
    photo = str(data['photo']['name'] or '')
    # NOTE(review): credentials are hard-coded; move them to config/env
    # before using this outside a local demo.
    config = {
        'host': '127.0.0.1',
        'user': 'root',
        'password': 'root',
        'database': 'test',
        'charset': 'utf8',
        'port': 3306,  # port must be an int, not a str
    }
    db = pymysql.connect(**config)
    cursor = db.cursor()
    try:
        db.select_db("test")
        # Parameterized query — values are never interpolated into the SQL string.
        sql = "INSERT INTO test.cdtf_data(id,name,licenseNumber,description,phone,photo)VALUES(%s,%s,%s,%s,%s,%s)"
        cursor.execute(sql, (a, name_str, license_number, description, phone, photo))
        db.commit()
        print('插入数据成功')
    except Exception as e:
        db.rollback()
        print("插入数据失败")
        print('Failed:', e)
    finally:
        # Always release the cursor and connection, even when the insert
        # (or the rollback itself) fails — the original leaked on that path.
        cursor.close()
        db.close()
    # Throttle requests so we do not hammer the server.
    time.sleep(3)
    # Save the lawyer's photo to disk.
    url = "https://cld.cdtf.gov.cn/attachment_server/" + photo
    root = "d://pics//"
    path = root + url.split('/')[-1]
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r = requests.get(url, timeout=10)
            # `with` closes the file automatically; the explicit f.close()
            # in the original was redundant.
            with open(path, 'wb') as f:
                f.write(r.content)
            print("图片保存cg")
        else:
            # NOTE(review): this branch means "file already exists", but the
            # message says the save failed — misleading, kept for compatibility.
            print("保存失败")
    except Exception as e:
        # Narrowed from a bare `except:` so the error is at least visible.
        print('爬取失败', e)
# 定义函数抓取
def crow_list(a):
    """Fetch one page of the lawyer listing and crawl every lawyer on it.

    :param a: 1-based page number for the paginated query
    """
    # Paginated listing endpoint.
    url = 'https://******/portal_server/lawyer/query/page?pageNum=' + str(a or '')
    # Timeout so a stalled server cannot hang the whole crawl.
    r = requests.get(url, timeout=10)
    # Force UTF-8 so Chinese text does not come back garbled.
    r.encoding = 'utf-8'
    # r.json() already returns a Python dict; the original
    # json.dumps()/json.loads() round-trip was redundant.
    user_dic = r.json()
    print("------当前页数为" + str(a))
    # Renamed from `list`, which shadowed the builtin.
    records = user_dic['data']['records']
    for user in records:
        print('当前律师ID:' + user['id'])
        crow_first(user['id'])
if __name__ == '__main__':
    # Crawl listing pages 150 through 2398 inclusive;
    # 2399 is the total page count at ten records per page.
    for page in range(150, 2399):
        crow_list(page)