抓取百度贴吧
目标:
1、获取帖子标题、总页数、评论、图片
2、图片写入文件并保存
3、将各种信息实现打印(测试追踪)
4、输入帖子号便能实现以上操作(亦适用于其它帖子)
第一版:
# -*-coding:utf-8-*-
import random
import re
import os
import urllib
import requests
import urllib.request
import time
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
"""初始化查询的网址"""
siteURL = "http://tieba.baidu.com/p/"
def replace(x):
    """
    Strip Tieba HTML markup from scraped text and normalize whitespace.

    :param x: raw text, either a string or a list of string fragments
              (e.g. the result of ``re.findall``)
    :return: cleaned text with tags removed, line-break tags turned into
             ``\\n``, ``<td>`` turned into ``\\t``, and surrounding
             whitespace stripped
    """
    # Join list input into one string, otherwise re.sub raises
    # "expected string or bytes-like object".
    x = ''.join(x)
    # NOTE(review): the original patterns were corrupted (tag text lost);
    # reconstructed from the comments alongside each one.
    removeImg = re.compile('<img.*?>| {7}|&nbsp;')   # drop img tags, 7-space runs, &nbsp;
    removeAddr = re.compile('<a.*?>|</a>')           # drop hyperlink tags
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')  # block-level tags -> newline
    replaceTD = re.compile('<td>')                   # table cells -> tab
    replaceBR = re.compile('<br><br>|<br>')          # single/double breaks -> newline
    removeExtraTag = re.compile('<.*?>')             # any remaining tag
    removeNoneLine = re.compile('\n+')               # collapse blank-line runs
    x = re.sub(removeImg, "", x)
    x = re.sub(removeAddr, "", x)
    x = re.sub(replaceLine, "\n", x)
    x = re.sub(replaceTD, "\t", x)
    x = re.sub(replaceBR, "\n", x)
    x = re.sub(removeExtraTag, "", x)
    x = re.sub(removeNoneLine, "\n", x)
    return x.strip()  # drop leading/trailing whitespace
def getSource(url):
    """
    Fetch the HTML source of a page using a randomly chosen User-Agent.

    :param url: page URL to fetch
    :return: response body as text
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ \(KHTML, like Gecko) Element Browser 5.0',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) \Version/6.0 Mobile/10A5355d Safari/8536.25',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) \Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)'
    ]
    # random.choice replaces the manual len/randint/index dance; the stray
    # debug print of the list length is removed.
    user_agent = random.choice(user_agents)
    # Bug fix: the header name must be 'User-Agent' (hyphen). The original
    # 'User_agent' is not a recognized header, so the UA rotation was a no-op.
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers)
    return r.text
def saveImage(imageURL, path, title, name, pageName):
    """
    Download an image and save it under ``photo/<title>/<path>/<pageName>/``.

    :param imageURL: URL of the image to download
    :param path: sub-directory under the title folder (e.g. floor/author)
    :param title: post title, used as a folder name (must not contain '/')
    :param name: file-name stem for the saved image
    :param pageName: page-level sub-directory
    :return: None; errors are printed, not raised (best-effort download)
    """
    try:
        # Directory of this script, so downloads land next to the code.
        proDir = os.path.split(os.path.realpath(__file__))[0]
        fileName = name + '.jpg'
        dirPath = os.path.join(proDir, "photo", title, path, pageName)
        # Bug fix: the original did ``filePath + fileName`` with no path
        # separator, gluing the last directory and the file name together,
        # and never created the target directory. Create it, then join.
        os.makedirs(dirPath, exist_ok=True)
        urllib.request.urlretrieve(imageURL, os.path.join(dirPath, fileName))
    except Exception as e:
        # Best-effort: a single failed image should not abort the crawl.
        print(e)
def getTitle(url):
    """
    Get the title of a Tieba post and print it.

    :param url: post URL
    :return: cleaned title text
    """
    result = getSource(url)
    # NOTE(review): the original pattern literal was corrupted; reconstructed
    # from the Tieba title markup <h3 class="core_title_txt ...">...</h3>.
    # re.S lets '.' match newlines inside the tag.
    pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
    iteam = re.findall(pattern, result)
    # replace() joins the findall list and strips residual markup.
    text = replace(iteam)
    print(u'这篇文章的标题为------' + text)
    return text
def getPageNumber(url):
"""
获取该帖子的总页数,并打印输出
:param: url
:return:iteams
"""
result = getSource(url)
soup = BeautifulSoup(result, 'lxml')
# pattern = re.compile('