import requests
import csv
import os
from lxml import etree
def getHtml(name, page):
    """Fetch one page of JD search results for the given keyword; returns None on failure."""
    url = "https://search.jd.com/Search?"
    params = {"keyword": name, "enc": "utf-8", "page": page}
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
    try:
        res = requests.get(url=url, params=params, headers=headers)
        res.encoding = res.apparent_encoding
        return res.text
    except Exception as e:
        print(e)
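# Optional sketch, not part of the original script: preview the URL that requests will build
# from the params dict, using requests' public Request/PreparedRequest API. Handy when a
# search page comes back empty and you want to see exactly what was requested.
# The function name previewSearchUrl is hypothetical and is never called below.
def previewSearchUrl(name, page):
    req = requests.Request("GET", "https://search.jd.com/Search?",
                           params={"keyword": name, "enc": "utf-8", "page": page})
    return req.prepare().url  # e.g. https://search.jd.com/Search?keyword=...&enc=utf-8&page=1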
def getdiv(html):
    xpl = etree.HTML(html)  # parse the page into an XPath-capable tree
    divs = xpl.xpath("//div[@class='gl-i-wrap']")  # one <div class="gl-i-wrap"> per product card
    return divs
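# Minimal sketch of how the XPath expressions used below pick data out of one product card.
# The HTML string is hypothetical and only imitates the class names and the
# source-data-lazy-img attribute that the crawler relies on; real JD pages are far larger.
# The function name xpathDemo is an assumption and is never called by the script.
def xpathDemo():
    sample = """
    <div class="gl-i-wrap">
      <div class="p-img"><img source-data-lazy-img="//img.example.com/pet.jpg"/></div>
      <div class="p-price"><i>99.00</i></div>
      <div class="p-name p-name-type-2"><em>猫粮 全价成猫粮</em></div>
      <div class="p-shop"><a>示例旗舰店</a></div>
    </div>"""
    card = etree.HTML(sample).xpath("//div[@class='gl-i-wrap']")[0]
    print(card.xpath('div[@class="p-name p-name-type-2"]//em/text()'))  # ['猫粮 全价成猫粮']
    print(card.xpath('div[@class="p-price"]//i/text()'))                # ['99.00']
    print(card.xpath('div[@class="p-img"]//img[@source-data-lazy-img]')[0].get("source-data-lazy-img"))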
def getTitle(divs):
    for i in divs:
        print("-------------------------------------------------------------")
        print(i.xpath('div[@class="p-name p-name-type-2"]//em/text()'))
        data = i.xpath('div[@class="p-name p-name-type-2"]//em/text()')
        for data1 in data:
            # The first space in the title separates the product name from its description.
            index = data1.find(" ")
            dict1 = {"商品名称": data1[0:index], "描述": data1[index + 1:]}
            price = i.xpath('div[@class="p-price"]//i/text()')
            shop = i.xpath('div[@class="p-shop"]//a/text()')
            for data2, data3 in zip(price, shop):
                dict2 = {"价格": data2, "店铺": data3}
                # Print the comment-count text ("评论数: ... 条评价" = "comments: ... reviews").
                print("评论数:", i.xpath('div[@class="p-commit"]//a/text()'), "条评价")
                # for item in i.xpath('div[@class="p-commit"]/strong'):
                #     print(etree.tostring(item).decode("utf-8"))
                dic = {}
                dic.update(dict1)
                dic.update(dict2)
                # JD lazy-loads product images: the real URL is stored in the
                # source-data-lazy-img attribute rather than in src.
                imgs = i.xpath('div[@class="p-img"]//img[@source-data-lazy-img]')
                for index in range(0, len(imgs)):
                    img = imgs[index].get("source-data-lazy-img")
                    dict3 = {"图片链接": img}
                    # print(img)
                    dic.update(dict3)
                # Append one CSV row per product; write the header only while the file is empty.
                # Columns: 商品名称=product name, 描述=description, 价格=price, 店铺=shop, 图片链接=image URL.
                with open("jd.csv", "a", encoding="utf8", newline="") as f:
                    title = ["商品名称", "描述", "价格", "店铺", "图片链接"]
                    dictwri = csv.DictWriter(f, title)
                    if os.path.getsize('jd.csv') == 0:
                        dictwri.writeheader()
                    dictwri.writerow(dic)
if __name__ == "__main__":
    # Crawl up to 99 result pages for the keyword "宠物" (pets).
    for i in range(1, 100):
        html = getHtml("宠物", str(i))
        if not html:  # getHtml returns None when the request fails
            continue
        divs = getdiv(html)
        getTitle(divs)
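# Optional sketch, not called anywhere above: read jd.csv back with csv.DictReader to
# spot-check the rows the crawler wrote. It assumes the header row was written by getTitle,
# so the column names match the title list used there. printCsv is a hypothetical helper.
def printCsv(path="jd.csv"):
    with open(path, encoding="utf8", newline="") as f:
        for row in csv.DictReader(f):
            print(row["商品名称"], row["价格"], row["店铺"], row["图片链接"])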