前几天刚来头儿让爬个淘宝交易记录先看看,就用python写了个,我是分成两步爬的,首先是爬取商品链接,代码如下:
#-*- coding:utf-8 -*-
import BeautifulSoup
import urllib2
import json
import cookielib
class MyParser:
def __init__(self,seedurl,destpath,stop_file_path):
self.seedurl=seedurl
self.stop_file_path=stop_file_path
stop_file=open(stop_file_path,"rb")
splits=stop_file.readline().split("\t")
self.no_0=splits[0] #stop文件里的值:初始为0
self.no_1=splits[1] #当前页第几个物品
self.no_2=splits[2] #当前物品第几个记录
self.destpath=destpath
def run(self):
print self.no_0
while int(self.no_0)<5*44:
self.seedurl=self.seedurl+str(self.no_0)
headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
req=urllib2.Request(url=self.seedurl,headers=headers)
content=urllib2.urlopen(req).read()
contentsoup=BeautifulSoup.BeautifulSoup(content)
items=contentsoup.findAll("div",{"class":"col title"})
out_file=open(self.destpath,"a+")
for item in items:
print item.find("a")["href"]
out_file.write(item.find("a")["href"]+"\n")
out_file.flush()
out_file.close()
self.no_0=int(self.no_0)+44
print "ok"
def run():
seedurl="http://s.taobao.com/search?spm=a230r.1.8.15.5n02zF&refpid=420461_1006&tab=all&q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&bcoffset=-4&s="
item_stop_file="e://item_stop_file"
record_stop_file="s://record_stop_file"
outFile="e://out"
myParser=MyParser(seedurl,outFile,item_stop_file)
myParser.run()
if __name__=="__main__":
run()
print "done!"
这样得到了输出文件e://out ,每行是一个商品的链接。
下面根据上面爬到的文件,爬取每个商品的交易记录,代码如下:
#-*- coding:utf-8 -*-
'''
Created on 2014-07-23
@author: sj
'''
import re
import BeautifulSoup
import os
import urllib2
class MyParser:
def __init__(self,item_path_file,stop_file,out_file):
self.item_path_file=item_path_file
self.stop_file=stop_file
self.out_file=out_file
stop_object=open(self.stop_file,"rb")
splits=stop_object.readline().split("\t")
stop_object.close()
self.item=splits[0]
self.page=splits[1]
self.record=splits[2]
self.tag=0
def run(self):
print self.item
print self.page
print self.record
item_object=open(self.item_path_file,"rb")
num_items=len(item_object.readlines())
item_object.close()
item_object=open(self.item_path_file,"rb")
for line in item_object.readlines()[int(self.item):num_items]:
try:
if re.search("tmall",line):
stop_object=open(self.stop_file,"rb")
item_new=stop_object.readline().split("\t")[0]
stop_object.close()
stop_object=open(self.stop_file,"wb")
stop_object.write(item_new+"\t"+"0"+"\t"+"0"+"\n")
stop_object.flush()
stop_object.close()
continue
print line
headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
req=urllib2.Request(url=line,headers=headers)
content=urllib2.urlopen(req,timeout=3).read()
contentSoup=BeautifulSoup.BeautifulSoup(content)
data_api=contentSoup.find("button",{"id":"J_listBuyerOnView"})["data-api"]
parameters=data_api.split("?")[1]
stop_object=open(self.stop_file,"rb")
bid_page=stop_object.readline().split("\t")[1]
stop_object.close()
page_size=int(parameters.split("&")[2].split("=")[1])
while int(bid_page)<int(page_size):
print "没有超过pagesize的大小..."
print bid_page
if self.tag==1:
data_api=data_api.replace("bid_page="+str(bid_page),"bid_page="+str(int(bid_page)+1))
else:
data_api=data_api.replace("bid_page=1","bid_page="+str(int(bid_page)+1))
data_url=data_api+"&ua=006tpOWUuXBidH1MRWQZ0InIldyJ0J3AibxJg%3D%3D%7CtaBkcTQxVFHEsbQxBFEEIfY%3D%7CtJFV4sbweFGpcSkNye3Y7ckNKV7GLmae5976Lfo%3D%7Cs6aDR2N2MzZTVsO2szYjpsOmAwbil4KX4tei15LXgpeSh%2FLHQmax%7Csqcy9kFUkBUANfF0sJQ9VOM7Y%2BeTZUGWQQ%3D%3D%7CsSTgxOA3%7CsIVB9vM3Mvbj1pPGAmcSJ0KGk6bDxgJ3EpdTRnMWE9eihwLVAg%3D%3D%7Cv%2Fo%2Bia0L%2FGqyyuwU7KUtCc3o3Vic%2BZzJDVhtOA3aDQ%3D%3D%7CvusvmLyYXOuOy%2B4qrzpfm85L3jpvq767rmp%2Fau8rbjvsKC3pzektWB04vWq9%7Cvfj9%2BDw5%2FdgcCUxZnaj9iEw5XJitafw4LViP&t=1406097091097&callback=Hub.data.records_reload"
req=urllib2.Request(url=data_url,headers=headers)
datacontent=urllib2.urlopen(req,timeout=3).read()
datacontent=datacontent.decode("gbk").encode("utf-8")
self.deal(datacontent)
bid_page=int(bid_page)+1
stop_object=open(self.stop_file,"wb")
stop_object.write(str(self.item)+"\t"+str(bid_page)+"\t"+"0")
stop_object.flush()
stop_object.close()
self.tag=1
print self.item
if int(bid_page)>=page_size:
print "超过page_size大小,保存下一个物品的行数 0 0"
stop_object=open(self.stop_file,"wb")
stop_object.write(str(int(self.item)+1)+"\t0\t0\n")
stop_object.close()
self.item=int(self.item)+1
except Exception as e:
if e=="timed out":
continue
def deal(self,content):
ls=[m.start() for m in re.finditer("\"",content)]
content=content[(ls[0]+1):ls[-3]]
contentSoup=BeautifulSoup.BeautifulSoup(content)
recordshtml=contentSoup.find("tbody")
if recordshtml==None:
return
recordshtml=recordshtml.findAll("tr")
for record in recordshtml:
cols=record.findAll("td")
if len(cols)!=5:
continue
name=cols[0].text
price_em=cols[1].findAll("em")
price=price_em[-1].text
num=cols[2].text
time=cols[3].text
type=cols[4].text
line=name+"\t"+price+"\t"+num+"\t"+time+"\t"+type+"\n"
print line
out_object=open(self.out_file,"a+")
out_object.write(line)
out_object.flush()
out_object.close()
print "ok"
def run():
item_path_file="e:/item_path_file"
stop_file="e://stop_file"
out_file="e://records_file"
parser=MyParser(item_path_file,stop_file,out_file)
parser.run()
if __name__=="__main__":
run()
print "done~"
这里 item_path_file 就是第一步爬取到的商品链接文件,stop_file 用于记录爬取到的位置(其实不记录也可以)。注意,上面的程序没有把爬取失败的链接另外记录到文件里。
注意,这里可能会爬取到天猫上的物品,但是天猫的交易记录和淘宝的格式不一样,所以这里直接过滤掉天猫的。
这次爬数据比之前进步的地方:
try except的使用,之前没有用,每次超时还要手动把程序停掉,然后再开启,从断点处爬,try except 的使用使得超时就跳过本链接,这样少了很多人工操作。
后来得知自己这样写相当于全部手工实现爬虫,其实还有现成的 scrapy 爬虫框架,用起来会简单得多。