python爬取百度贴吧图片库_python爬取百度贴吧的图片2

今天看了一下 BeautifulSoup 库的用法,把昨天《python爬取百度贴吧的图片1》中的代码改成用 BeautifulSoup 库的函数来实现。用得还不太熟练,但是感觉比用正则表达式写起来容易一些。

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup

import urllib2

import urllib

import re

class imgTest:
    """Fetch the pages of one Baidu Tieba thread and save the images in them.

    The thread URL is built as ``baseUrl + '?see_lz=<seeLZ>' + '&pn=<page>'``.
    Methods that need a page return ``None`` (or an empty list) when the page
    could not be downloaded instead of failing inside the HTML parser.
    """

    def __init__(self, baseUrl, seeLZ):
        """Remember the thread URL and the ``see_lz`` query value.

        baseUrl -- URL of the thread, without a query string.
        seeLZ   -- value appended as ``?see_lz=<seeLZ>``.
        """
        self.baseUrl = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)

    def printToLog(self, mystr):
        """Append ``mystr`` plus a newline to ``txt/log.txt``."""
        # ``with`` closes the file even if the write raises.
        with open('txt/log.txt', 'a') as f:
            f.write(mystr + "\n")

    def getPage(self, pageNum):
        """Return the raw HTML of page ``pageNum``, or None on URLError."""
        url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print("failed to connect baidutieba. %s" % (e.reason,))
            return None

    def getPageNum(self):
        """Return the text of the second ``<span class="red">`` on page 1.

        The caller converts this value to the thread's page count.
        Returns None when page 1 could not be fetched or when fewer than
        two such spans are present.
        """
        page = self.getPage(1)
        if page is None:
            return None
        soup = BeautifulSoup(page, 'html.parser')
        redSpans = soup.find_all("span", class_='red')
        if len(redSpans) < 2:
            return None
        return redSpans[1].string

    def getTitle(self):
        """Return the text of the first ``<h3>`` on page 1, or None."""
        page = self.getPage(1)
        if page is None:
            return None
        soup = BeautifulSoup(page, 'html.parser')
        heading = soup.h3
        if heading is None:
            return None
        return heading.string

    def getAllImageURLs(self, pageNum):
        """Return the ``src`` of every ``<img class="BDE_Image">`` on a page.

        Tags without a ``src`` attribute are skipped so every returned item
        is a usable URL. Returns an empty list when the page could not be
        fetched. The list is also printed, as before.
        """
        page = self.getPage(pageNum)
        if page is None:
            return []
        soup = BeautifulSoup(page, 'html.parser')
        imgTags = soup.find_all("img", class_="BDE_Image")
        imgURLs = [tag.get('src') for tag in imgTags if tag.get('src')]
        print(imgURLs)
        return imgURLs

    def saveImg(self, imageURL, filename):
        """Download ``imageURL`` and write its bytes to ``filename``."""
        u = urllib.urlopen(imageURL)
        try:
            data = u.read()
        finally:
            u.close()
        with open(filename, 'wb') as f:
            f.write(data)

    @staticmethod
    def _extensionFor(imageURL):
        """Return the text after the last '.' of the URL, or "jpg".

        Anything longer than three characters (for example "jpeg", or a
        suffix followed by a query string) is replaced by "jpg".
        """
        fTail = imageURL.split('.').pop()
        if len(fTail) > 3:
            fTail = "jpg"
        return fTail

    def saveImgs(self, images, name, num):
        """Save every URL in ``images`` as ``<name>/<number>.<ext>``.

        images -- iterable of image URLs.
        name   -- output directory; created if it does not exist yet.
        num    -- number used for the first file; later files count up.
        """
        import os  # local import: only this method needs it

        # Without the directory the first open(..., 'wb') would fail.
        if name and not os.path.isdir(name):
            os.makedirs(name)

        number = num
        for imageURL in images:
            fileName = (name + "/" + str(number) + "."
                        + self._extensionFor(imageURL))
            self.saveImg(imageURL, fileName)
            number += 1

# Thread to scrape; the second constructor argument becomes the see_lz value.
baseURL = 'http://tieba.baidu.com/p/3925387672'
imgtest = imgTest(baseURL, 1)

# getPageNum() hands back text, so convert it before using it as a bound.
totalnum = int(imgtest.getPageNum())

# Running total of images seen so far; it is also passed to saveImgs as the
# number of the next file, so numbering continues from one page to the next.
imageCount = 0

# NOTE(review): the original indentation was lost; the print is assumed to
# sit inside the loop (progress after each page) -- confirm.
for i in range(1, totalnum + 1):
    imageURLs = imgtest.getAllImageURLs(i)
    imgtest.saveImgs(imageURLs, "pic", imageCount)
    imageCount = imageCount + len(imageURLs)
    print(imageCount)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值