本文实例讲述了Python实现爬取亚马逊数据并导出(写入)Excel文件的操作。分享给大家供大家参考,具体如下:
python大神们别喷,代码写得很粗糙,主要是完成功能,能够借鉴就看下吧。我是学java的,毕竟不是学python的,自己自学看了一点点python,望谅解。
#!/usr/bin/env python3
# encoding=UTF-8
import sys
import re
import urllib.request
import json
import time
import zlib
from html import unescape
import threading
import os
import xlwt
import math
import requests
# Raise the interpreter recursion limit (default is 1000).
# NOTE(review): the original comment said "one million" but the value is
# 1,000,000,000 — far beyond what the C stack can actually sustain; deep
# recursion would crash the process long before this limit is reached.
# Nothing in the visible code obviously recurses, so this may be dead tuning.
sys.setrecursionlimit(1000000000)
## Fetch the Amazon.cn landing page and extract all category names.
def getProUrl():
    """Return the list of category names scraped from the Amazon.cn home page.

    Posts to the home page and pulls every '"category" : "..."' value out
    of the response body with a regex.  Performs network I/O; returns an
    empty list when the request yields no matches.
    """
    urlList = []
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
    session = requests.Session()
    furl = "https://www.amazon.cn/?tag=baidu250-23&hvadid={creative}&ref=pz_ic_22fvxh4dwf_e&page="
    # range(0, 1) means only page 0 is fetched; widen the range to scrape more pages.
    for i in range(0, 1):
        html = session.post(furl + str(i), headers=headers)
        html.encoding = 'utf-8'
        # NOTE(review): the original also round-tripped the text through
        # gb2312 and compiled a second regex whose raw-string pattern was
        # lost in transcription; both results were never used, so they are
        # dropped here.
        name = '"category" : "' + '(.*?)' + '"'
        reg1 = re.compile(name, re.S)
        urlList = reg1.findall(html.text)
    return urlList
## Build the search-results URL for one category.
def getUrlData(ci):
    """Return the Amazon.cn search URL (page 1, sorted by review rank) for category *ci*."""
    base = ("https://www.amazon.cn/s/ref=nb_sb_noss_2"
            "?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99"
            "&url=search-alias%3Daps&field-keywords=")
    suffix = "&page=1&sort=review-rank"
    return base + ci + suffix
## Throttle helper: pause between scraping requests.
# NOTE(review): the original comment claimed a 1-second wait but the code
# slept 3 seconds; the 3-second default is kept for compatibility and the
# duration is now parameterized.
def fun_timer(delay=3):
    """Sleep for *delay* seconds (default 3) to rate-limit requests."""
    time.sleep(delay)
## Fetch the search-results page for every category and collect the raw HTML.
def getProData(allUrlList):
    """Return a list of raw HTML pages, one per category name in *allUrlList*.

    For each category this sleeps (fun_timer) to throttle, GETs the search
    URL built by getUrlData, forces utf-8 decoding, echoes the page to
    stdout, and appends the text.  Performs network I/O; pages are returned
    unparsed.
    """
    webContentHtmlList = []
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
    for ci in allUrlList:
        session = requests.Session()
        fun_timer()  # throttle between requests
        html = session.get(getUrlData(ci), headers=headers)
        # Force the response to be decoded as utf-8.
        html.encoding = 'utf-8'
        # NOTE(review): the original also round-tripped the text through
        # gb2312 and ran a regex whose raw-string pattern was lost in
        # transcription; both results were never used and are dropped here.
        print(html.text)
        webContentHtmlList.append(html.text)
    return webContentHtmlList
##根据网页内容过滤需要的属性和值
def getProValue():
list1 = [] * 5
list2 = [] * 5
list3 = [] * 5
list4 = [] * 5
list5 = [] * 5
list6 = [] * 5
list7 = [] * 5
list8 = [] * 5
urlList = getProUrl();