
Scraping Juhuasuan Product Pages with Python: Extracting Product Information and Saving It Locally as XML

Source: 中文源码网    Date: September 2, 2018

This post shares working example code that fetches a Juhuasuan (聚划算) deal page, extracts the product information, and saves it locally as XML, for your reference; the full script follows below. It targets Python 2 and relies on BeautifulSoup 3 and chardet, as the imports show.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Spider.py

import urllib2
import httplib
import StringIO
import gzip
import re
import chardet
import sys
import os
import datetime
from xml.dom.minidom import Document
from BeautifulSoup import BeautifulSoup

## Work around UnicodeEncodeError when printing non-ASCII text to the console
reload(sys)
sys.setdefaultencoding("utf8")

#####################################################
## Debug switch: when enabled, HTTP request header information and debug logs are printed
DEBUG = 1
NO_DEBUG = 0
httplib.HTTPConnection.debuglevel = DEBUG
## Whether to print the fetched page source
showSrcCode = False
## Compression scheme accepted from the server
ZIP_TYPE = "gzip"

fileName = "auctions"
location = "d:/spiderData/"

## request headers
headerConfig = {"User-Agent": "taobao-yanyuan.qzs", "Accept-encoding": ZIP_TYPE}
#####################################################

#############class SpiderConfig #####################
class SpiderConfig:
    """
    Configuration for one spider task: a name and the URL to fetch.
    """
    def __init__(self, name, url):
        self.name = name
        self.url = url
#####################################################

##############class SpiderAuctionDomain##############
class SpiderAuctionDomain:
    """
    Stores the information of one auction scraped by the spider.
    """
    title = ""
    link = ""  # renamed from "url": parse() and save() both use the "link" attribute
    img = ""
    price = ""

    def __init__(self):
        pass
#####################################################
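## Note: title/link/img/price above are class attributes, so they act as shared
## defaults until an instance assigns its own value:
##   a = SpiderAuctionDomain(); a.title = "deal A"  # instance attribute shadows the default
##   b = SpiderAuctionDomain(); print b.title       # still "" (class default)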

########class SpiderDefaultErrorHandler##############
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        """
        Default error handler for the spider: wrap the failed response
        in an HTTPError and return it instead of raising.
        """
        result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
        result.status = code
        result.url = req.get_full_url()
        # the original log message was lost in transcription; report the error here
        print "http error, status:", code, "url:", result.url
        return result
#####################################################

#############class SpiderHandler#####################
class SpiderHandler:
    """
    Spider handler: fetches a page, parses it, and saves the result.
    """
    def spider(self, spiderConfig):
        try:
            request = urllib2.Request(spiderConfig.url)
            ## configure request headers
            for key, val in headerConfig.items():
                request.add_header(key, val)
            ## build opener
            opener = urllib2.build_opener(SpiderDefaultErrorHandler())
            ## open request
            openRequest = opener.open(request)
            ## read data
            spiderData = openRequest.read()
            ## close
            opener.close()
            if 0 == len(spiderData):
                return
            ## decompress when the server answered with gzip-compressed content
            if ZIP_TYPE == openRequest.headers.get("Content-Encoding"):
                spiderData = SpiderHandler.gzipData(self, spiderData)
            if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:
                print spiderData
            # parse html
            SpiderHandler.parse(self, spiderData)
        except Exception, x:
            print "spider process Exception:", x

    def parse(self, spiderData):
        """
        Parse the HTML content and extract the auction information.
        """
        if httplib.HTTPConnection.debuglevel == DEBUG:
            charsetAnalyze = chardet.detect(spiderData)
            print "analyze spider data encode :", charsetAnalyze["encoding"]

        print "start parsing", fileName

        soup = BeautifulSoup(spiderData)
        encode = soup.originalEncoding
        encoding = lambda x: x.encode(encode)

        if httplib.HTTPConnection.debuglevel == DEBUG:
            print "detected encoding:", encode

        title = soup.head.title.string
        print encoding(title)

        spiderContents = soup.findAll(name="div", attrs={"class": "main-box avil"})
        auctions = ["%s" % s for s in spiderContents]
        if not auctions:  # a list comprehension is never None; test for emptiness instead
            return

        auctionList = []
        for auc in auctions:
            auctionDomain = SpiderAuctionDomain()
            # parse auction link; the original pattern was garbled in transcription,
            # so this regex (capturing item_id out of the deal anchor) is a reconstruction
            links = re.search(re.compile(r'item_id=([^\'">&]*)[\'"]', re.IGNORECASE), auc)
            if links is not None:
                auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0]))

            # parse auction title (reconstructed pattern: the text of the anchor tag)
            titles = re.search(re.compile(r"<a[^>]*>([^<]*)</a>", re.IGNORECASE), auc)
            if titles is not None:
                auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0]))

            # parse auction price (reconstructed pattern: an <em> element holding the price)
            price = re.search(re.compile(r"<em>([^<]*)</em>", re.IGNORECASE), auc)
            if price is not None:
                auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0])

            # parse image url (recovered from the url-encoded remains of the original pattern)
            imgs = re.search(re.compile(r"<img src=[\'\"]([^\'\"]*)[\'\"]", re.IGNORECASE), auc)
            if imgs is not None:
                auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0])

            auctionList.append(auctionDomain)

        print "auctions parsed:"
        for a in auctionList:
            print "--->", a.title

        # sort auction list by price
        auctionList = SpiderHandler.sortAuctionList(self, auctionList)

        # save to file
        SpiderHandler.save(self, auctionList)

        print "parsing finished"

    def sortAuctionList(self, auctionList):
        """
        Bubble sort the auctions by price, ascending.
        """
        length = len(auctionList)
        if length < 2:
            return auctionList
        else:
            for i in range(length - 1):
                for j in range(length - i - 1):
                    if float(auctionList[j].price) > float(auctionList[j + 1].price):
                        auctionList[j], auctionList[j + 1] = auctionList[j + 1], auctionList[j]
            return auctionList
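    # Note: bubble sort is O(n^2); the built-in sort produces the same ascending
    # order in one line (assuming every price field parses as a float):
    #   auctionList.sort(key=lambda a: float(a.price))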

    def save(self, auctionList):
        if auctionList is not None:
            doc = Document()
            auctions = doc.createElement("auctions")
            doc.appendChild(auctions)
            for auc in auctionList:
                auction = doc.createElement("auction")
                auctions.appendChild(auction)
                SpiderHandler.generateXML(self, doc, auction, "title", auc.title)
                SpiderHandler.generateXML(self, doc, auction, "price", auc.price)
                SpiderHandler.generateXML(self, doc, auction, "img", auc.img)
                SpiderHandler.generateXML(self, doc, auction, "link", auc.link)
            if False == os.path.exists(location):
                os.mkdir(location)
            file = open(location + fileName + ".xml", 'w')
            file.write(doc.toprettyxml())
            file.close()
            if httplib.HTTPConnection.debuglevel == DEBUG:
                print doc.toprettyxml()

    def generateXML(self, doc, f, name, txt):
        # append a child element <name> with text content txt under node f
        c = doc.createElement(name)
        f.appendChild(c)
        c.appendChild(doc.createTextNode(txt))

    def gzipData(self, spiderData):
        """
        Decompress gzip-encoded response data.
        """
        if 0 == len(spiderData):
            return spiderData
        spiderDataStream = StringIO.StringIO(spiderData)
        spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()
        return spiderData
#####################################################
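## A quick round-trip check for gzipData, kept as a comment so the script's
## behavior is unchanged (it uses only the standard library already imported above):
##   buf = StringIO.StringIO()
##   zipped = gzip.GzipFile(fileobj=buf, mode="wb")
##   zipped.write("hello spider")
##   zipped.close()
##   print SpiderHandler().gzipData(buf.getvalue())  # -> "hello spider"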

if __name__ == "__main__":
    # note: the original format string used "%m" (month) where minutes were
    # intended; "%M" is the minutes directive
    nowtime = lambda: datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %H:%M:%S")

    needSpiderUrl = {"suzhou": "http://ju.taobao.com/suzhou",
                     "hangzhou": "http://ju.taobao.com/hangzhou",
                     "shanghai": "http://ju.taobao.com/shanghai",
                     "beijing": "http://ju.taobao.com/beijing",
                     "chengdu": "http://ju.taobao.com/chengdu"}

    configList = []
    for k, v in needSpiderUrl.items():
        spiderConfig = SpiderConfig(k, v)
        configList.append(spiderConfig)

    spiderHandler = SpiderHandler()
    print "spider start time:", nowtime()
    for spiderConfig in configList:
        fileName = spiderConfig.name
        spiderHandler.spider(spiderConfig)
    print "spider finish time:", nowtime()

For more on this topic, see the collection 《python爬取功能汇总》. That's all for this article; we hope it helps with your learning, and please keep supporting 中文源码网.
