php抓取1688商品详情,python 抓取 1688商品详情

写得还不完善:抓取的价格还算稳定,但商品详情现在还无法抓到。

#!/usr/bin/python

import datetime
import hashlib
import json
import os
import StringIO
import sys
import threading
import urllib2
from time import ctime, sleep

import pycurl
from lxml import etree

# Wall-clock time taken when the script starts; the last lines of the script
# subtract it from the end time to report how long the run took.
starttime = datetime.datetime.now()

#https pycurl

def spider_curl(url):
    """Download ``url`` with pycurl and hand the page to ``show_pach``.

    The response body is buffered in memory and passed, together with the
    URL, to ``show_pach``. Nothing is returned.

    NOTE(review): peer and host-name verification are switched off below,
    so HTTPS responses are not authenticated. Confirm this is really wanted
    before relying on the scraped data.

    Args:
        url: Address of the page to fetch.
    """
    b = StringIO.StringIO()
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, b.write)
        c.setopt(pycurl.SSL_VERIFYPEER, 0)
        c.setopt(pycurl.SSL_VERIFYHOST, 0)
        c.perform()
    finally:
        # Release the libcurl handle even when the transfer fails; the
        # original never closed it.
        c.close()

    html = b.getvalue()
    show_pach(html, url)

def _sql_quote(value):
    """Return ``value`` as text that is safe inside a single-quoted SQL literal.

    Non-text values are converted with ``str``; backslashes and single quotes
    are backslash-escaped.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    if not isinstance(value, (str, type(u""))):
        value = str(value)
    return value.replace("\\", "\\\\").replace("'", "\\'")


def show_pach(html, url):
    """Parse a product page and print an INSERT statement built from it.

    Reads the page title, up to five preview image URLs and a cost price out
    of ``html``, derives the sale and market prices from the cost, and then
    prints the title, the cover image, the comma-separated image list and
    the generated SQL statement. Nothing is returned.

    Args:
        html: Raw HTML of the product page.
        url: Address the page came from. Not used by the function; kept so
            existing callers keep working.
    """
    tree = etree.HTML(html)

    # All three queries are absolute ("//..."), so they can be evaluated on
    # the parsed tree directly. Going through nodes[0] of "/html/body" only
    # added an IndexError on pages without a <body>.
    title_nodes = tree.xpath("//title")
    thumb_nodes = tree.xpath('//li[@data-imgs]')
    price_nodes = tree.xpath('//span[@class="value"][2]')

    cost = ""
    sale_price = ""
    market_price = ""
    shop_price = 100  # stock (translated from the original comment)

    # The last span holding a usable number wins, as before; spans with no
    # text or non-numeric text are skipped instead of aborting the run.
    for node in price_nodes:
        try:
            base = float(node.text)
        except (TypeError, ValueError):
            continue
        cost = node.text
        sale_price = base * 1.4
        market_price = base * 1.8

    # Only the first five thumbnails are used.
    previews = []
    for node in thumb_nodes[:5]:
        # The attribute comes from a remote page, so it is parsed as data
        # with json.loads rather than executed with eval().
        # Presumably the value is a JSON object -- verify on a live page.
        image_info = json.loads(node.attrib['data-imgs'])
        previews.append(image_info["preview"])

    # NOTE(review): the cover is the SECOND preview (index 1), exactly as in
    # the original loop -- confirm that index 0 was not intended.
    cover = previews[1] if len(previews) > 1 else ""
    img_s = ",".join(previews)

    # Default to an empty title so a page without <title> no longer fails
    # when the title is concatenated below.
    title = ""
    for node in title_nodes:
        # Drops the last 11 characters (presumably the site-name suffix).
        title = (node.text or "")[:-11]

    print(title + "\n")
    print(cover + "\n")
    print(img_s + "\n")

    values = [
        title,
        "163",
        cover,
        cover,
        img_s,
        sale_price,
        market_price,
        shop_price,
        cost,
        "content",
    ]
    # Scraped text may contain quotes, so every value is escaped before it is
    # placed between the single quotes of the statement.
    sql = (
        "INSERT INTO `wpin`.`yge_product` ( `title`, `category_id`, `attach_thumb`,`attach_image`,`slider`,`sale_price`,`market_price`,`shop_price`,`chengben`, `content`) VALUES ('"
        + "','".join(_sql_quote(value) for value in values)
        + "')"
    )
    print(sql)

def download_img_https(url):
    """Download the resource at ``url`` and save it under ``file/``.

    The file is written as ``file/nimabi<md5-of-url>.jpg``; the directory is
    created when it does not exist yet. The URL is printed once the data has
    been written. Nothing is returned.

    NOTE(review): peer and host-name verification are switched off below,
    so HTTPS responses are not authenticated -- confirm this is intended.

    Args:
        url: Address of the image to download.
    """
    b = StringIO.StringIO()
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, b.write)
        c.setopt(pycurl.SSL_VERIFYPEER, 0)
        c.setopt(pycurl.SSL_VERIFYHOST, 0)
        c.perform()
    finally:
        # Release the libcurl handle even when the transfer fails.
        c.close()
    payload = b.getvalue()

    # MD5 works on bytes; encode text URLs first so non-ASCII characters do
    # not break the digest. type(u"") is ``unicode`` on Python 2.
    key = url.encode("utf-8") if isinstance(url, type(u"")) else url
    digest = hashlib.md5(key).hexdigest()

    # open() does not create missing directories.
    if not os.path.isdir("file"):
        os.makedirs("file")

    # ``with`` closes the file even when the write fails.
    with open("file/nimabi" + digest + ".jpg", "wb") as fk:
        fk.write(payload)
        print(url)

def urllibget(i):
    """Fetch the page at URL ``i`` with urllib2 and pass it to ``show_pach``.

    Args:
        i: Address of the page to fetch.
    """
    response = urllib2.urlopen(i)
    try:
        html = response.read()
    finally:
        # Close the connection even when reading fails; the original left
        # the response open.
        response.close()
    show_pach(html, i)

def run():
    """Prompt for one URL on standard input and scrape it.

    Input that does not begin with ``http`` is rejected with a message;
    anything else is handed to ``urllibget``.
    """
    target = raw_input("add one url: ")

    # Guard clause: bail out early on anything that is not an http(s) URL.
    if not target.startswith('http'):
        print("please a true 1688 detail url ")
        return

    urllibget(target)

# Script body: scrape one product, then report how long the run took.
run()

endtime = datetime.datetime.now()
# Whole seconds between ``starttime`` and now; the time spent waiting at the
# input prompt is included because ``starttime`` is taken before ``run()``.
elapsed = endtime - starttime
print(elapsed.seconds)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值