再接再厉,这次继续用 Python 3 学习编写了一个爬取百度贴吧帖子的程序。不多说,直接上关键代码:
#抓取贴吧一个帖子上的内容(一页内容)
import urllib
import urllib.request
import re
# Fetch one page of a single Baidu Tieba thread and print its title,
# the reply/page counters and the content of every post on that page.
page = 1
baseUrl = r'https://tieba.baidu.com/p/2687476192'
# Sent as the `see_lz` query parameter.
# NOTE(review): presumably 1 restricts the listing to the thread starter's
# posts and 0 shows everyone's - confirm against the site.
seeLZ = 0
try:
    url = baseUrl + '?see_lz=' + str(seeLZ) + '&pn=' + str(page)
    request = urllib.request.Request(url)
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read, even if decoding fails.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')
except urllib.request.URLError as e:
    # Only the network call above can raise URLError, so the try body is
    # kept to those lines; parsing happens in the else branch.
    if hasattr(e, 'reason'):
        print(e.reason)
else:
    # Thread title.
    patternTitle = re.compile(r'<h\d class="core_title_txt.*?>(.*?)</h\d>', re.S)
    resultTitle = re.search(patternTitle, content)
    # re.search returns None when the markup does not match (page layout
    # changed, thread removed, ...); report it instead of crashing.
    if resultTitle is not None:
        print(resultTitle.group(1).strip())
    else:
        print('Thread title not found')

    # Reply count (group 1) and total number of pages (group 2).
    patternNum = re.compile(r'<li class="l_reply_num".*?><span.*?>(.*?)</span.*?<span.*?>(.*?)</span>', re.S)
    resultNum = re.search(patternNum, content)
    if resultNum is not None:
        print(resultNum.group(1).strip(), resultNum.group(2).strip())
    else:
        print('Reply count / page count not found')

    # Content of each post ("floor") on this page.
    patternContent = re.compile(r'<div id="post_content_.*?">(.*?)</div>', re.S)
    items = re.findall(patternContent, content)
    # NOTE(review): Tool is not defined in this snippet; it has to be
    # defined or imported elsewhere in the file. Its replace() presumably
    # cleans the raw HTML of a post - confirm against its definition.
    tool = Tool()
    for item in items:
        print('\n', tool.replace(item), '\n')
完整代码也上传到了 GitHub:https://github.com/callMeBin2217/python3_Spider ,有兴趣的朋友可以下载来看看,或者和我交流交流。小小小小小小白,求轻喷。