前几天刚来,头儿让我先爬个淘宝交易记录看看,我就用 Python 写了一个。我是分成两步爬的:首先是爬取商品链接,代码如下:
#-*- coding:utf-8 -*-
import BeautifulSoup
import urllib2
import json
import cookielib
class MyParser:
    """Collect item links from paginated search-result pages.

    The starting offset is read from a tab-separated "stop file". Each page
    is requested at ``seedurl + offset`` and every link found in its result
    title blocks is appended to ``destpath``, one per line.
    """

    # Amount the offset advances after each fetched page.
    PAGE_SIZE = 44
    # The crawl stops once the offset reaches MAX_PAGES * PAGE_SIZE.
    MAX_PAGES = 5
    # Browser-like User-Agent sent with every request.
    USER_AGENT = ("Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36")

    def __init__(self, seedurl, destpath, stop_file_path):
        """Store the crawl settings and load the saved position.

        Args:
            seedurl: URL prefix; the current offset is appended to it to
                form each page URL.
            destpath: File that collected links are appended to.
            stop_file_path: File whose first line holds three
                tab-separated values.

        Raises:
            IOError: If the stop file cannot be opened.
            IndexError: If the first line has fewer than three fields.
        """
        self.seedurl = seedurl
        self.stop_file_path = stop_file_path
        self.destpath = destpath
        # Close the handle as soon as the line is read; it used to be leaked.
        with open(stop_file_path) as stop_file:
            first_line = stop_file.readline()
        # Drop the line terminator so the last field does not carry a newline.
        fields = first_line.rstrip("\r\n").split("\t")
        self.no_0 = fields[0]  # value from the stop file: initially 0
        self.no_1 = fields[1]  # index of the item on the current page
        self.no_2 = fields[2]  # index of the record within the current item

    def run(self):
        """Fetch result pages from the saved offset and append their links.

        Pages are requested while the offset is below
        ``MAX_PAGES * PAGE_SIZE``. After each page the offset advances by
        ``PAGE_SIZE`` and is stored back into ``self.no_0`` as an int.
        """
        # Single-argument print(...) behaves the same as the Python 2
        # print statement this file was written with.
        print(self.no_0)
        limit = self.MAX_PAGES * self.PAGE_SIZE
        offset = int(self.no_0)
        while offset < limit:
            # Build the page URL from the untouched seed. Appending to
            # self.seedurl in place made the offsets pile up across
            # iterations ("...0", "...044", "...04488").
            page_url = self.seedurl + str(offset)
            links = self._extract_links(self._fetch_page(page_url))
            # The context manager closes (and flushes) the file even if a
            # write fails part-way through the page.
            with open(self.destpath, "a+") as out_file:
                for href in links:
                    print(href)
                    out_file.write(href + "\n")
            offset += self.PAGE_SIZE
            self.no_0 = offset
        # NOTE(review): the original indentation was lost; "ok" is assumed
        # to be printed once, after the whole crawl.
        print("ok")

    def _fetch_page(self, url):
        """Return the response body of *url*, requested with USER_AGENT."""
        request = urllib2.Request(url=url,
                                  headers={"User-Agent": self.USER_AGENT})
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()

    def _extract_links(self, html):
        """Return the first-anchor href of every "col title" div in *html*."""
        soup = BeautifulSoup.BeautifulSoup(html)
        links = []
        for block in soup.findAll("div", {"class": "col title"}):
            anchor = block.find("a")
            # Skip blocks without a usable link instead of aborting the
            # whole crawl on one malformed result.
            href = anchor.get("href") if anchor is not None else None
            if href:
                links.append(href)
        return links
def run():
seedurl="http://s.taobao.com/search?spm=a230r.1.8.15.5n02zF&refpid=420461_1006&tab=all&q=%C5%AE%D1%A9%B7%C4%C9%C0&am