# 废话不多说,直接上代码  (no more preamble — straight to the code)
# 今天倒霉的网站是惠农网供应大厅  (today's unlucky target: the cnhnb.com supply hall)
import requests
import pymysql
from bs4 import BeautifulSoup # 用来解析网页
from fake_useragent import UserAgent
import uuid
import time
# --- Scraper configuration ---------------------------------------------------

# Category slugs that appear in the listing URLs (fruit, vegetables,
# meat/eggs, aquatic products, ...).  Each one is scraped in turn.
types = ["sgzw", "sczw", "qcrd", "shuic", "nfjg", "lymm", "zzzm", "mmhc", "nznj", "zyc", "tudi", "package", "fffu"]

# Session cookie captured from a logged-in browser session; it will expire
# eventually and has to be refreshed by hand.
cookk = {
    'Cookie': 'sessionId=S_0KHA8RH66NM124OL; Hm_lvt_91cf34f62b9bedb16460ca36cf192f4c=1604907974,1604908122; Hm_lpvt_91cf34f63a8ff527100d30; lmvid.sig=Si_wW_olbim0OMvL1hiezOhV1QMMdZFpUaz_-YYY93w; deviceId=05ac3ef-5c72-405f-be76-83e97e186; hnUserTicket=d67ed8dc-df8a-45fe-85cd-1530efae920d; hnUserId=125807455',
}

# Baseline request headers.  The User-Agent value here is only a default:
# the scrape loop overwrites it with a random UA before every request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 '
        'Safari/537.36'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.8',
}

# Site root, prepended to the relative hrefs found on listing pages.
pUrl = "https://www.cnhnb.com"

# MySQL connection used to persist the scraped rows.
# NOTE(review): credentials are hard-coded — fine for a throwaway script,
# but move them to env vars/config before sharing this file.
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset="utf8mb4")
cur = conn.cursor()
print("连接成功")
# Map category slug -> human-readable category name; unknown slugs fall
# back to "其他" (other).  Replaces the original 11-branch if/elif chain
# that was re-evaluated for every single product.
type_names = {
    "sgzw": "水果",
    "sczw": "蔬菜",
    "qcrd": "禽畜肉蛋",
    "shuic": "水产",
    "nfjg": "农副加工",
    "lymm": "粮油米面",
    "zzzm": "种子种苗",
    "mmhc": "苗木花草",
    "nznj": "农资农机",
    "zyc": "中药材",
    "tudi": "土地流转",
}

for tt in types:
    # The category name is invariant per slug, so resolve it once per
    # category instead of once per product card.
    v_type = type_names.get(tt, "其他")
    for i in range(1, 2):  # scrapes page 1 only; widen the range for more pages
        headers["User-Agent"] = UserAgent().random  # rotate UA per request
        resp = requests.get(f"https://www.cnhnb.com/p/{tt}-0-0-0-0-{i}", headers=headers, cookies=cookk)
        page_one = BeautifulSoup(resp.content, "html.parser")
        dd = page_one.find_all("div", class_='product-items')
        print("到这里了")
        # BUG FIX: find_all() returns a (possibly empty) ResultSet, never
        # None, so the original `if dd is None` check could never fire and
        # an empty page fell through into the item loop.
        if not dd:
            print("要回去了")
            continue
        print(dd)
        for pp, ss in enumerate(dd, start=1):  # one <div> per product card
            print(f"{pp}进入分类")
            # Primary key for this row.
            productId = str(uuid.uuid1())
            # Build the absolute product-detail URL; skip malformed cards
            # instead of crashing with AttributeError/KeyError.
            link = ss.find('a')
            if link is None or not link.has_attr('href'):
                continue
            sUrl = pUrl + link['href']
            print(sUrl)
            # Fetch the detail page for the origin and the update time.
            resp_two = requests.get(sUrl, headers=headers, cookies=cookk)
            page_two = BeautifulSoup(resp_two.content, "html.parser")
            place_tag = page_two.find('span', class_="fs14 gray6")
            time_tag = page_two.find("p", class_="update-time")
            if place_tag is None or time_tag is None:
                # Layout change or blocked request — skip this item rather
                # than abort the whole run on AttributeError.
                continue
            place = place_tag.text.strip()          # product origin
            v_time = time_tag.text[5:]              # drops a 5-char label prefix — presumably "更新时间:"-style; confirm
            try:
                # Price shown on the listing card.
                v_price = ss.find('span').text.strip()
                title_text = ss.find('div', class_='title').text
                # Variety name: fixed-offset slice of the card title, first
                # space-separated token.  NOTE(review): the 15/-1 offsets
                # assume a fixed title layout — verify against live markup.
                v_name = title_text.strip()[15:-1].split(" ")[0]
                # Unit, e.g. "元/斤": last three characters of the price
                # text (original `aa[-3:-1] + aa[-1]` simplified to a slice).
                aa = ss.find('div', class_='shops-price').text.strip()
                unit = aa[-3:]
                # NOTE(review): str.strip("优选") removes those characters
                # from BOTH ends, not a prefix — kept as in the original.
                v_title = title_text.strip("优选").strip()
            except AttributeError:
                continue  # card missing an expected element; skip it
            # Parameterized insert — values are never interpolated into SQL.
            sql = "insert into gong_ying(id,v_name,v_price,v_unit,v_place,v_type,v_title,v_url,v_time) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            cur.execute(sql, (productId, v_name, v_price, unit, place, v_type, v_title, sUrl, v_time))
            print("sql已执行")
        print("第{}页已结束".format(i))
        conn.commit()      # commit once per page
        time.sleep(1)      # 1 s pause between pages so we don't hammer the server
cur.close()
conn.close()