php抓捕1688商品详情,python 抓取 1688商品详情

最新推荐文章于 2024-06-25 11:38:31 发布

weixin_39968490

最新推荐文章于 2024-06-25 11:38:31 发布

阅读量422

点赞数

文章标签： php抓捕1688商品详情

写得还不完善：价格抓取还算稳定，但商品详情目前还抓不到。

#!/usr/bin/python

import threading

from time import ctime,sleep

import pycurl

import urllib2

import sys,os

import StringIO

from lxml import etree

import datetime

# Wall-clock time at script start; the end of the script subtracts it from
# the finishing time to print the elapsed seconds.
starttime = datetime.datetime.now()

#https pycurl

def spider_curl(url):
    """Fetch *url* with pycurl and pass the response body to show_pach().

    Peer and host-name certificate verification are both switched off, so
    the HTTPS response is not authenticated.

    Args:
        url: Address of the page to download.
    """
    b = StringIO.StringIO()
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.URL, url)
        # Collect the response body in the in-memory buffer.
        c.setopt(pycurl.WRITEFUNCTION, b.write)
        c.setopt(pycurl.SSL_VERIFYPEER, 0)
        c.setopt(pycurl.SSL_VERIFYHOST, 0)
        c.perform()
    finally:
        # Release the curl handle even when the transfer raises.
        c.close()

    html = b.getvalue()
    show_pach(html, url)

def _sql_quote(text):
    """Escape backslashes and single quotes in *text* for a quoted SQL literal."""
    return text.replace("\\", "\\\\").replace("'", "\\'")


def show_pach(html, url):
    """Extract product fields from a detail page and print an INSERT statement.

    The page title, the cover image, a comma-separated list of up to five
    preview images and the generated SQL text are printed; nothing is
    returned and the SQL is not executed here.

    Args:
        html: Markup of the product detail page.
        url: Address the page came from. Not used by the extraction; kept so
            existing callers keep working.
    """
    # Local import: ast is not among the file's top-level imports.
    import ast

    tree = etree.HTML(html)
    # All three expressions are absolute ("//..."), so they are evaluated
    # against the whole document regardless of the context node.
    title_nodes = tree.xpath("//title")
    thumb_nodes = tree.xpath('//li[@data-imgs]')
    price_nodes = tree.xpath('//span[@class="value"][2]')

    cost = ""
    sale_price = ""
    market_price = ""
    shop_price = 100  # stock

    # The last price node with a numeric text wins; sale and market prices
    # are the cost marked up by fixed factors.
    for node in price_nodes:
        try:
            base = float(node.text)
        except (TypeError, ValueError):
            # Empty or non-numeric node: skip it instead of aborting the run.
            continue
        cost = node.text
        sale_price = base * 1.4
        market_price = base * 1.8

    cover = ""
    previews = []
    for index, node in enumerate(thumb_nodes[:5]):
        # SECURITY: the attribute text comes from the downloaded page, so it
        # is parsed with ast.literal_eval (literals only) rather than eval(),
        # which would execute arbitrary expressions embedded in the page.
        info = ast.literal_eval(node.attrib['data-imgs'])
        preview = str(info["preview"])
        # NOTE(review): the cover is the second thumbnail (index 1); confirm
        # that index 0 was not intended.
        if index == 1:
            cover = preview
        previews.append(preview)
    img_s = ",".join(previews)

    title = ""
    for node in title_nodes:
        # Drop the last 11 characters -- presumably a fixed site suffix in
        # the page title; verify against a live page.
        title = (node.text or "")[:-11]

    print(title + "\n")
    print(cover + "\n")
    print(img_s + "\n")

    values = [
        _sql_quote(title),
        "163",
        _sql_quote(cover),
        _sql_quote(cover),
        _sql_quote(img_s),
        str(sale_price),
        str(market_price),
        str(shop_price),
        str(cost),
        "content",
    ]
    sql = (
        "INSERT INTO `wpin`.`yge_product` "
        "( `title`, `category_id`, `attach_thumb`,`attach_image`,`slider`,"
        "`sale_price`,`market_price`,`shop_price`,`chengben`, `content`) "
        "VALUES ('" + "','".join(values) + "')"
    )
    print(sql)

def download_img_https(url):
    """Download *url* with pycurl and save the body as a .jpg under file/.

    The file name is "nimabi" followed by the MD5 hex digest of the URL.
    Peer and host-name certificate verification are both switched off. The
    URL is printed once the file has been written.

    Args:
        url: Address of the image to download.
    """
    # hashlib is not among the file's top-level imports, so import it here;
    # without it the md5 call below raises NameError.
    import hashlib

    b = StringIO.StringIO()
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, b.write)
        c.setopt(pycurl.SSL_VERIFYPEER, 0)
        c.setopt(pycurl.SSL_VERIFYHOST, 0)
        c.perform()
    finally:
        # Release the curl handle even when the transfer raises.
        c.close()

    digest = hashlib.md5(url).hexdigest()
    # The context manager closes the file even if the write fails.
    with open("file/nimabi" + digest + ".jpg", "wb") as fk:
        fk.write(b.getvalue())
    print(url)

def urllibget(i):
    """Download the page at *i* with urllib2 and hand its body to show_pach().

    Args:
        i: Address of the page to fetch.
    """
    response = urllib2.urlopen(i)
    try:
        html = response.read()
    finally:
        # Close the connection whether or not the read succeeded.
        response.close()
    show_pach(html, i)

def run():
    """Prompt for one URL and scrape it when it begins with "http".

    Any other input only prints a hint and returns without fetching.
    """
    address = raw_input("add one url: ")
    if address.startswith('http'):
        urllibget(address)
        return
    print("please a true 1688 detail url ")

# Run the interactive scrape, then print how many seconds the script took.
run()

endtime = datetime.datetime.now()
elapsed = endtime - starttime
print(elapsed.seconds)

weixin_39968490

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
php抓捕1688商品详情,python 抓取 1688商品详情

写的还是不完善，抓取的价格还稳定，还有详情现在无法抓到#!/usr/bin/pythonimport threadingfrom time import ctime,sleepimport pycurlimport urllib2import sys,osimport StringIOfrom lxml import etreeimport datetimestarttime = datetime...
复制链接

扫一扫