《2018年7月15日》【连续286天】
标题:xpath练习,京东商品列表爬取;
内容:
昨天的网站看了一下,发现源代码都是js的,直接按f12了,就不能用xpath了。
今天尝试了一下用xpath爬取京东的商品搜索页面,还有不少问题:
import requests
from urllib.parse import quote
from lxml import etree
from requests.exceptions import RequestException
def find_goods(page, s, keyword):
    """Fetch one page of JD search results for ``keyword``.

    Args:
        page: Value sent as the ``page`` query parameter.
        s: Value sent as the ``s`` query parameter.
        keyword: Search term; it is percent-encoded before being placed
            in the URL.

    Returns:
        The ``requests`` response (decoded as UTF-8) when the server
        answers with HTTP 200; ``None`` for any other status code or when
        the request raises a ``RequestException``.
    """
    encoded = quote(keyword)
    # ``wq`` now mirrors the keyword instead of a value left over from one
    # specific manual search.
    url = (
        'https://search.jd.com/Search?keyword=' + encoded
        + '&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=' + encoded
        + '&page=' + str(page) + '&s=' + str(s) + '&click=0'
    )
    # The header value must be the bare UA string; the header name is
    # already supplied by the dictionary key.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) '
                      'Gecko/20100101 Firefox/4.0.1'
    }
    try:
        # Without a timeout a stalled connection would block forever.
        # Timeouts are RequestException subclasses, so they end up below.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    response.encoding = "utf-8"
    return response
def print_goods(r):
    """Print the title and price of every product found in a result page.

    Args:
        r: Response object whose ``text`` attribute holds the page HTML,
            or ``None`` when the download failed.

    Returns:
        None. One formatted line per product is written to stdout.
    """
    if r is None:
        # Nothing was downloaded, so there is nothing to parse.
        return
    tplt = "{:30}\t¥{:6}"
    html = etree.HTML(r.text)
    if html is None:
        # Defensive: no document tree was produced from the text.
        return
    # Read title and price relative to each product node so the two values
    # always belong to the same item; two independent page-wide lists go
    # out of step (or raise IndexError) when one product lacks a price.
    for item in html.xpath('//li[@class="gl-item"]/div[@class="gl-i-wrap"]'):
        titles = item.xpath('.//div[@class="p-img"]/a/@title')
        prices = item.xpath('.//div[@class="p-price"]//i/text()')
        if titles and prices:
            print(tplt.format(titles[0], prices[0]))
def main(keyword):
    """Download the first search-result page for ``keyword`` and print it.

    Args:
        keyword: Search term passed on to ``find_goods``.
    """
    page = 1
    s = 1
    response = find_goods(page, s, keyword)
    if response is None:
        # find_goods returns None on a non-200 status or a request error;
        # stop here instead of passing None to the printing step.
        print("request failed, nothing to print")
        return
    print_goods(response)
# Script entry point: run a sample search when executed directly.
if __name__ == "__main__":
    main("手表")
明天再修改修改;