为了整理以前的项目,现将以前编写的爬虫程序上传。此项目用于爬取淘宝上笔记本电脑的价格、详细参数、销量等数据。程序初期有效,但随后淘宝上线了相应的反爬虫机制,此爬取方法现已失效。
此爬取过程分两步。其一是爬取搜索结果页面上所有的商品及其链接地址:
import re
import xlwt
import time
import requests
import pandas as pd
from retrying import retry
from concurrent.futures import ThreadPoolExecutor
# Reference timestamp taken when the script starts. time.clock() was removed
# in Python 3.8; time.perf_counter() is the documented replacement.
start = time.perf_counter()

# Offsets appended to the search URL as the `s` query parameter:
# 0, 44, 88, ... for 100 result pages.
# NOTE(review): the step of 44 is presumably the number of items per page.
plist = [44 * (page - 1) for page in range(1, 101)]
listno = plist  # second name for the same list; read by multithreading()

# Empty frame that the parsing loop grows one result page at a time.
datatmsp = pd.DataFrame(columns=[])
@retry(stop_max_attempt_number=8)
def network_programming(num, timeout=10):
    """Fetch one Taobao search-result page.

    Args:
        num: Item offset appended to the URL as the ``s`` query parameter.
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout a stalled connection would block the worker thread
            forever and the ``retry`` decorator would never get to retry.

    Returns:
        The ``requests`` response object with its encoding set to UTF-8.
    """
    # NOTE(review): the `q` parameter percent-decodes to the Chinese word for
    # "lipstick", not a laptop keyword -- confirm the intended search term.
    url = 'https://s.taobao.com/search?q=%E5%8F%A3%E7%BA%A2&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180724&ie=utf8&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=' + str(num)
    web = requests.get(url, timeout=timeout)
    web.encoding = 'utf-8'
    return web
def multithreading():
    """Download every offset in ``listno`` on a pool of 10 worker threads.

    Returns:
        list: One ``network_programming`` result per entry of ``listno``,
        in the same order as ``listno``.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        # chunksize is carried over unchanged from the original call.
        pages = list(pool.map(network_programming, listno, chunksize=10))
    return pages
# Step 1 driver: download all search pages, pull the embedded "auctions" JSON
# out of each page, and accumulate the rows into `datatmsp`.
from io import StringIO  # read_json expects a file-like object, not raw text

listpg = []  # page numbers of the pages that yielded auction data
event = multithreading()
for i in event:
    # The item list is embedded in the page source between these two JSON keys.
    auctions = re.findall(
        '"auctions":(.*?),"recommendAuctions"', i.text)
    if auctions:
        # Passing a literal JSON string to read_json is deprecated; wrap it.
        table = pd.read_json(StringIO(auctions[0]))
        datatmsp = pd.concat([datatmsp, table],
                             axis=0, ignore_index=True)
        page_numbers = re.findall(
            '"pageNum":(.*?),"p4pbottom_up"', i.text)
        # Guard the lookup: indexing [0] on an empty match list would raise
        # IndexError and abort the whole run.
        if page_numbers:
            listpg.append(page_numbers[0])
    print(datatmsp)  # progress output after each page
print(listpg)
print(datatmsp)
# NOTE(review): recent pandas releases may no longer be able to write the
# legacy .xls format -- confirm against the installed pandas version.
datatmsp.to_excel('D:/python/data1.xls', index=False)
其二是根据第一步搜索得到的链接地址,逐个爬取商品的详细信息。
import xlrd
import xlwt
import os
import re
import pandas as pd
# Switch to the data directory before any relative file path is opened.
# Raw string: '\p' is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning; the resulting path is unchanged.
os.chdir(r'D:\python')
from requests_html import HTMLSession
# HTML session object; nothing in the visible script uses it afterwards.
session = HTMLSession()
import requests
# Step 2 driver: read the item links from the workbook, fetch each item page,
# extract its embedded JSON and append it to `datatmsp`.
from io import StringIO  # read_json expects a file-like object, not raw text

workbook = xlrd.open_workbook(r'data_laptop7.27.xls')
sheet1 = workbook.sheet_by_name('Sheet1')
# Column index 13 holds the protocol-less links ('https:' is prepended below).
cols = sheet1.col_values(13)
a = []
datatmsp = pd.DataFrame(columns=[])

# Compiled once instead of on every iteration.
data_pattern = re.compile('"data":(.*?),"pager"')

# Start at row 1 (row 0 is skipped, as in the original) and keep the original
# cap of 2400, but never run past the end of the column.
for i in range(1, min(2400, len(cols))):
    url = 'https:' + str(cols[i])
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Best effort: report the failing row and move on to the next link.
        print(i, exc)
        continue
    r.encoding = 'utf-8'
    found = data_pattern.findall(r.text)
    if not found:
        continue
    try:
        medium = found[0]
        # Keep only what follows the nested '"data":' key, drop the final
        # character and wrap the remainder in brackets to form a JSON list.
        n = medium.find('"data":')
        medium = '[' + medium[n + 7:-1] + ']'
        # 'frame' is the documented value for a DataFrame result.
        table = pd.read_json(StringIO(medium), typ='frame')
        datatmsp = pd.concat([datatmsp, table], axis=0, ignore_index=True)
        print(i)
    except Exception as exc:
        # Still best effort, but Ctrl-C and SystemExit are no longer swallowed
        # as they were by the bare `except:`.
        print(i, exc)
print(datatmsp)
# NOTE(review): recent pandas releases may no longer be able to write the
# legacy .xls format -- confirm against the installed pandas version.
datatmsp.to_excel('D:/python/data_laptop_append8.25.xls', index=False)
此代码略显繁琐,爬取速率较慢,为我写爬虫的首次尝试。