# Crawl images from Baidu Tieba using lxml/XPath (使用 XPath 爬取百度贴吧的图片).
import urllib2
import urllib
from lxml import etree
import os
class TiebaSpider:
def tiebaSpider(self , url , beginPage , endPage):
for page in range(beginPage , endPge + 1):
pn = (page - 1) * 50
fullurll = url + "&pn=" + str(pn)
self.loadPage(fullurl)
print "谢谢使用"
def loadPage(self , url):
headers = {"User-Agent" :"Mozilla/5.0(compatible;MSIE9.0;WindowsNT6. 1;Trident/5.0"}
request = urllib2.Request(url , headers = headers)
html = urllib2.urlopen(request).read()
content = etree.HTML(html)
link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
for link in link_list:
fulllink = "http://tieba.baidu.com" + link
self.loadImage(fulllink)
def loadImage(self , link):
headers = {"User-Agent" :"Mozilla/5.0(compatible;MSIE9.0;WindowsNT6. 1;Trident/5.0"}
request = urllib2.Request(link , headers = headers)
response = urllib2.urlopen(request)
html = response.read()
content = etree.HTML(html)
link_list = content.xpath('//div/img[@class="BDE_Image"]/@src')
for link in link_list:
self.writeImage(link)
def writeImage(self , link):
headers = {"User-Agent" :"Mozilla/5.0(compatible;MSIE9.0;WindowsNT6. 1;Trident/5.0"}
request = urllib2.Request(link , headers = headers)
image = urllib2.urlopen(request).read()
filename = link[-10:]
dir = "pic"
if not os.path.exists(dir):
os.makedir(dir)
with open("pic/" + filename , "wb") as f :
f.write(image)
print "已经成功下载" + filename
if __name__ == "__main__":
    # Ask the user which tieba to crawl and over which page range,
    # build the search URL, then hand off to the spider.
    kw = raw_input("请输入需要爬取的贴吧名:")
    beginPage = int(raw_input("请输入起始页:"))
    endPage = int(raw_input("请输入结束页:"))
    query = urllib.urlencode({"kw": kw})
    fullurl = "http://tieba.baidu.com/f?" + query
    spider = TiebaSpider()
    spider.tiebaSpider(fullurl, beginPage, endPage)