import datetime
import time
import urllib
import urllib.request

import pymysql
import requests
from lxml import etree
# encoding:utf-8
# Shared MySQL connection, opened as soon as the module is loaded and used by
# every insert performed by this script.
# NOTE(review): host and credentials are hard-coded here.
conn = pymysql.connect(host="localhost", port=3306, user="root",
                       password="123456", database="jizhang")

# Running counter: dbservice() uses it as the row id and as the image file
# name, and advances it by one after each stored item.
gg = 1996
def dbservice(conn, username, sp, image, pr):
    """Download one product image and insert the product into ``tb_item``.

    The module-level counter ``gg`` supplies both the row ``id`` and the
    image file name. It is advanced only after the row has been committed,
    so a failed download or insert does not consume an id.

    Args:
        conn: Open database connection offering ``cursor()`` and ``commit()``.
        username: Product title, stored in the ``title`` column.
        sp: Selling-point text, stored in ``sell_point``.
        image: Image URL without its scheme; ``"http:"`` is prepended before
            downloading (presumably a ``//host/path`` style URL).
        pr: Price, stored in ``price``.

    Raises:
        Any exception raised by the download or by the database driver is
        propagated unchanged to the caller.
    """
    global gg
    # A single timestamp keeps ``created`` and ``updated`` identical for a
    # freshly inserted row.
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    num = 512
    cid = 319
    image1 = "http:" + image
    # The database stores the URL of the local copy, not the remote URL.
    image = "http://localhost:8089/pic/phone/" + str(gg) + ".jpg"
    print(image1)
    urllib.request.urlretrieve(image1, 'e:/JDpic/' + str(gg) + '.jpg')
    print("第" + str(gg) + "条")
    print(username)
    # Open the cursor only once the download has succeeded, and always
    # release it, even when the INSERT fails.
    cursor = conn.cursor()
    try:
        cursor.execute(
            "INSERT INTO tb_item(title,id,sell_point,price,num,image,cid,created,updated) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s);",
            [username, gg, sp, pr, num, image, cid, now, now],
        )
        conn.commit()
    finally:
        cursor.close()
    gg = gg + 1
def crow_first(n):
    """Fetch one JD search result page and store every product found on it.

    For each ``gl-item`` list element the price, name, selling point and
    image URL are extracted and handed to ``dbservice`` together with the
    module-level connection ``conn``.

    Args:
        n: Page number put into the ``page`` query parameter; the ``s``
            parameter is derived from it as ``1 + (n - 1) * 30``.

    Raises:
        Exceptions from the HTTP request, the HTML parser or ``dbservice``
        are not caught here.
    """
    # Search keyword in the URL is "衣服" (clothes).
    url = 'https://search.jd.com/Search?keyword=%E8%A1%A3%E6%9C%8D&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&stock=1&page=' + str(n) + '&s=' + str(1 + (n - 1) * 30) + '&click=0&scrolling=y'
    # NOTE(review): these headers (including the cookie) look like they were
    # copied from a captured browser request - confirm they are still needed.
    head = {
        'authority': 'search.jd.com',
        'method': 'GET',
        'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461',
        'scheme': 'https',
        'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
        'Cookie': 'qrsc=3; pinId=RAGa4xMoVrs; xtest=1210.cf6b6759; ipLocation=%u5E7F%u4E1C; _jrda=5; TrackID=1aUdbc9HHS2MdEzabuYEyED1iDJaLWwBAfGBfyIHJZCLWKfWaB_KHKIMX9Vj9_2wUakxuSLAO9AFtB2U0SsAD-mXIh5rIfuDiSHSNhZcsJvg; shshshfpa=17943c91-d534-104f-a035-6e1719740bb6-1525571955; shshshfpb=2f200f7c5265e4af999b95b20d90e6618559f7251020a80ea1aee61500; cn=0; 3AB9D23F7A4B3C9B=QFOFIDQSIC7TZDQ7U4RPNYNFQN7S26SFCQQGTC3YU5UZQJZUBNPEXMX7O3R7SIRBTTJ72AXC4S3IJ46ESBLTNHD37U; ipLoc-djd=19-1607-3638-3638.608841570; __jdu=930036140; user-key=31a7628c-a9b2-44b0-8147-f10a9e597d6f; areaId=19; __jdv=122270672|direct|-|none|-|1529893590075; PCSYCityID=25; mt_xid=V2_52007VwsQU1xaVVoaSClUA2YLEAdbWk5YSk9MQAA0BBZOVQ0ADwNLGlUAZwQXVQpaAlkvShhcDHsCFU5eXENaGkIZWg5nAyJQbVhiWR9BGlUNZwoWYl1dVF0%3D; __jdc=122270672; shshshfp=72ec41b59960ea9a26956307465948f6; rkv=V0700; __jda=122270672.930036140.-.1529979524.1529984840.85; __jdb=122270672.1.930036140|85.1529984840; shshshsID=f797fbad20f4e576e9c30d1c381ecbb1_1_1529984840145',
    }
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(url, headers=head, timeout=30)
    # Setting ``r.encoding`` only influences ``r.text``; since the raw bytes
    # are parsed, the UTF-8 decoding has to be requested from the parser.
    html1 = etree.HTML(r.content, parser=etree.HTMLParser(encoding='utf-8'))
    datas = html1.xpath('//li[contains(@class,"gl-item")]')
    for s2, data in enumerate(datas, start=1):
        p_price = data.xpath('div/div[@class="p-price"]/strong/i/text()')
        if not p_price:
            # Some items carry the price only in an attribute; resolve it
            # BEFORE storing the item so the fallback value is what gets saved.
            p_price = data.xpath('div/div[@class="p-price"]/strong/@data-price')
        p_name = data.xpath('div/div[@class="p-name p-name-type-2"]/a/em/text()')
        p_sell_point = data.xpath('div/div[@class="p-name p-name-type-2"]/a/i/text()')
        p_image = data.xpath("div[@class='gl-i-wrap']/div[@class='p-img']/a/img/@source-data-lazy-img")
        dbservice(conn, "".join(p_name), "".join(p_sell_point), "".join(p_image), "".join(p_price))
        # NOTE(review): the original indentation of this print was lost; it is
        # treated here as per-item progress output - confirm.
        print(s2)
if __name__ == '__main__':
    # Crawl result pages 1..99. A failure on one page is reported and the
    # crawl moves on to the next page (deliberate best-effort behaviour).
    try:
        for i in range(1, 100):
            print('***************************************************')
            try:
                print(' First_Page: ' + str(i))
                crow_first(i)
                print(' Finish')
            except Exception as e:
                print(e)
            print('------------------')
    finally:
        # Close the shared connection even when the loop is left early,
        # e.g. by Ctrl-C, which ``except Exception`` does not intercept.
        conn.close()
# This script scrapes each product's price, name and selling point.
# Tip: the Chrome extension "XPath Helper" is very useful for anyone who scrapes pages often.