由于最近在写一个电商项目，需要大量的商品信息，因此学习了一下如何爬取现有商城的商品信息。
爬取页面
# -*- coding: utf-8 -*-
import requests
import lxml.html
import json
def parse_url(xiaohua_url, headers, *, timeout=10, encoding="gbk"):
    """Fetch a page over HTTP GET and return its body as decoded text.

    Args:
        xiaohua_url: URL of the page to download.
        headers: HTTP request headers passed straight to ``requests.get``.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            scraper indefinitely.
        encoding: Codec used to decode the raw response bytes. Defaults
            to ``"gbk"``, the codec this function has always used.

    Returns:
        The response body decoded to ``str`` using ``encoding``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        UnicodeDecodeError: If the body is not valid in ``encoding``.
    """
    response = requests.get(xiaohua_url, headers=headers, timeout=timeout)
    # Decode the raw bytes explicitly instead of relying on response.text,
    # whose charset guess may differ from the page's real encoding.
    return response.content.decode(encoding)
def get_data(html_content):
metree=lxml.html.etree
# 解析对象
parser=metree.HTML(html_content,metree.HTMLParser())
# 解析获得在当前校花中的所有信息
div_list=parser.xpath('//div[@id="search_nature_rg"]/ul[@class="bigimg cloth_shoplist"]/li')
# print(div_list)
result=[]
index=0
for element in div_list:
index+=1
item={
}
# item["top_title"]=element.xpath('./div[@class="goods-list-item c-goods J_pro_items"]/@id')
item["top_title"]=element.xpath('./a/@title')[0]
if index <= 8:
item["pict_src"]=element.xpath('./a/img/@src')[0]
if index>8:
item["pict_src"]=element.xpath('./a/img/@data-original')[0]
item["price"] = element.xpath('./p[@class="price"]/span[@class=