最近做项目缺少数据，随手写了一个爬虫，不知道好不好用，拿出来分享一下，希望各位大佬多多指导。商品评论数量（QuantitySold）这个字段不知道为什么获取到的值始终为空，所以我这里暂时用随机数代替。
import random
import time
from urllib.parse import urljoin

import pymysql
import requests
from DBUtils.PooledDB import PooledDB
from lxml import etree
# Shared MySQL connection pool; insert_data() borrows a connection from it
# for every row it writes.
db_pool = PooledDB(
    # PyMySQL is the database driver used to create connections.
    creator=pymysql,
    # Pool sizing: minimum idle, maximum idle and maximum total connections.
    mincached=1,
    maxcached=20,
    maxconnections=20,
    # Connection parameters for the target database.
    host="localhost",
    user="root",
    password="",
    database="shopping",
    charset="utf8mb4",
)
# 插入数据
def insert_data(data):
    """Insert one product row into the ``products`` table.

    Args:
        data: Sequence of seven values in column order: shopname, name,
            type, QuantitySold, price, description, images.

    Any exception raised while executing or committing the statement is
    printed and the transaction is rolled back; it is not re-raised, so a
    single bad row does not stop the caller.
    """
    sql = (
        "INSERT INTO products "
        "(shopname, name, type, QuantitySold, price, description, images) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)"
    )
    conn = db_pool.connection()
    try:
        # The cursor is created inside the outer try so that a failure here
        # still releases the connection below.
        cursor = conn.cursor()
        try:
            cursor.execute(sql, data)
            conn.commit()
        except Exception as e:
            print("Error inserting data:", e)
            conn.rollback()
        finally:
            cursor.close()
    finally:
        # Always close the pooled connection, whatever happened above.
        conn.close()
# HTTP headers sent with every request made by this script. Both values are
# blank here -- presumably placeholders for a real browser Cookie and
# User-Agent (TODO confirm before running).
head = {"Cookie": "", "User-Agent": ""}
def main(shoptitle, page):
    """Scrape one JD search-result page and store each product it lists.

    For every product entry the shop name, title, price, image link and
    detail-page link are read from the listing; ``paqu`` is then called to
    fetch the detail page and insert the row.

    Args:
        shoptitle: Search keyword; it is also passed on as the product type.
        page: Value sent as the ``page`` query parameter.

    A failed search request is printed and the page is skipped.
    """
    search_url = "https://search.jd.com/Search"
    # Let requests build the query string: the keyword is URL-encoded, so
    # spaces, "&" or non-ASCII text in it still produce a valid URL.
    query = {
        "keyword": shoptitle,
        "wq": shoptitle,
        "pvid": "8858151673f941e9b1a4d2c7214b2b52",
        "isList": 0,
        "page": page,
    }
    try:
        # Without a timeout a stalled server would block the crawl forever.
        resp = requests.get(search_url, params=query, headers=head, timeout=10)
    except requests.RequestException as e:
        print("Error fetching search page:", e)
        return
    resp.encoding = "utf-8"
    et = etree.HTML(resp.text)

    # All fields are read relative to the same <li>, so a product that lacks
    # one field cannot shift the other fields onto a neighbouring product.
    wrap = "div[@class='gl-i-wrap']"
    field_paths = (
        "div/div/span/a/text()",  # shop name
        f"{wrap}/div[@class='p-img']/a/@title",  # product title
        f"{wrap}/div[@class='p-price']/strong/i/text()",  # price
        f"{wrap}/div[@class='p-img']/a/@href",  # detail-page link
        f"{wrap}/div[@class='p-img']/a/img/@data-lazy-img",  # image link
    )
    products = []
    for li in et.xpath("//*[@id='J_goodsList']/ul/li"):
        matches = [li.xpath(path) for path in field_paths]
        if not all(matches):
            # Incomplete entry: skip it rather than store mismatched data.
            continue
        shop, name, price, href, image = (found[0] for found in matches)
        # urljoin resolves protocol-relative ("//host/path") links and
        # leaves links that are already absolute untouched.
        products.append(
            (urljoin(search_url, href), shop, name, price, urljoin(search_url, image))
        )

    # Short pause between the search request and the detail requests.
    time.sleep(1)
    for detail_url, shop, name, price, image_url in products:
        # The comment count could not be scraped from the listing page (the
        # p-commit XPath always came back empty), so a random placeholder
        # is stored as QuantitySold instead.
        random_number = random.randint(100, 500000)
        paqu(detail_url, shop, name, shoptitle, random_number, price, image_url)
def paqu(Url, shoppingName, title, shoptitle, random_number, price, img):
    """Fetch a product detail page and insert the product into the database.

    Args:
        Url: Address of the product detail page.
        shoppingName: Shop name (stored in ``shopname``).
        title: Product title (stored in ``name``).
        shoptitle: Search keyword (stored in ``type``).
        random_number: Value stored in ``QuantitySold``.
        price: Product price.
        img: Image link (stored in ``images``); also printed to stdout.

    A failed detail request is printed and the product is skipped, so one
    unreachable page does not abort the rest of the crawl.
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        resp = requests.get(Url, headers=head, timeout=10)
    except requests.RequestException as e:
        print("Error fetching detail page:", e)
        return
    resp.encoding = "utf-8"
    et = etree.HTML(resp.text)
    detail_texts = et.xpath("//*[@id='detail']/div[2]/div[1]/div[1]/ul[3]/li/text()")
    # Each entry keeps its trailing comma ("a,b,") so the stored description
    # format stays the same as for rows already in the table.
    description = "".join(f"{text}," for text in detail_texts)
    print(img)
    insert_data(
        (shoppingName, title, shoptitle, random_number, price, description, img)
    )
# Module-level default; replaced by the user's input when run as a script.
shoptitle = ""

if __name__ == '__main__':
    shoptitle = input("请输入你要爬取的商品类型")
    # Choose how many result pages to crawl by adjusting this range; as
    # written, page values 2 through 21 are requested, one call each.
    for page_number in range(2, 22):
        main(shoptitle, page_number)