# -*- coding: utf-8 -*-
import urllib.request
import urllib.error
import re
import time
import bs4

# Entry URL: the 剑网3 (JX3) Baidu Tieba forum index
url_mian = 'http://tieba.baidu.com/f?kw=%E5%89%91%E7%BD%913&fr=index&fp=0&ie=utf-8&red_tag=q3464037905'
# Download a page, retrying on 5xx server errors
def download(url, num_retries=2):
    print('Downloading:', url)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    # Check whether the URL is reachable
    try:
        html = urllib.request.urlopen(request).read().decode('utf-8')
    except UnicodeDecodeError:
        # Page is not valid UTF-8; skip it
        return None
    except urllib.error.URLError as e:
        print('Downloading error:', e.reason)
        html = None
        # Retry only on 5xx (server-side) error codes
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    print('Page download finished')
    return html
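
# A minimal usage sketch (not called by the main flow; the name is ours):
# download() returns the decoded HTML on success and None on failure.
def _download_demo():
    page = download(url_mian)
    if page is None:
        print('Fetch failed')
    else:
        print('Fetched %d characters' % len(page))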
# Collected thread URLs (deduplicated); filled by splicContent()
Urls = []

# Grab the title and URL of every thread on the current list page,
# then recurse into the next list page
def getUrl(url):
    content = download(url)
    if content is not None:
        print('Download succeeded')
        reg = r'a href=(.*?) class="j_th_tit "'
        ref = re.compile(reg)
        cont = re.findall(ref, content)
        for i in cont:
            splicContent(i)
        print(Urls)
        url = findNext(url)
        if url is not None:
            getUrl(url)
        else:
            return
    else:
        # Download failed; drop the URL if it was recorded earlier
        if url in Urls:
            Urls.remove(url)
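
# Sketch (not wired into the main flow): the same thread-link extraction
# done with BeautifulSoup instead of the regex above; the j_th_tit class
# name comes from that regex, the function name is ours.
def getUrl_bs4(url):
    html = download(url)
    if html is None:
        return []
    soup = bs4.BeautifulSoup(html, 'html.parser')
    links = []
    for a in soup.find_all('a', class_='j_th_tit'):
        links.append(('https://tieba.baidu.com' + a.get('href', ''), a.get_text()))
    return links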
# Split a matched link into its URL part and title part
def splicContent(i):
    print('Splitting...')
    parts = i.split(' ')
    titles = parts[1]
    # Keep only threads whose title contains one of the target keywords:
    # '818' / '八一八' (gossip), '树洞' (vent threads), '回忆' (memories)
    if titles.find('818') != -1 or titles.find('八一八') != -1 or titles.find('树洞') != -1 or titles.find('回忆') != -1:
        titles = titles[6:]
        ends = len(parts[0])
        # see_lz=1 restricts the thread to the original poster's posts
        url = 'https://tieba.baidu.com' + parts[0][1:ends - 1] + '?see_lz=1'
        print('Creating file...')
        Contents.clear()  # Fresh buffer per thread, so files don't accumulate earlier threads' posts
        lookInUrl(url)  # Start crawling the thread content
        if url not in Urls:
            Urls.append(url)
        return Urls
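
# Sketch: the keyword filter above expressed with any(); KEYWORDS repeats
# the same four strings the if-chain checks, the names are ours.
KEYWORDS = ('818', '八一八', '树洞', '回忆')
def _title_matches(title):
    return any(k in title for k in KEYWORDS)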
# Check whether the list page has a "next page" link; return its URL if so
def findNext(url):
    html = download(url)
    if html is not None:
        reg = r'a href=.* class="next pagination-item "'
        ref = re.compile(reg)
        cont = re.findall(ref, html)
        if len(cont) != 0:
            # Extract the next page's URL out of the matched anchor
            parts = cont[0].split(' ')
            ends = len(parts[1])
            nextUrl = parts[1][6:ends - 1]
            return nextUrl
        else:
            print('No next page')
            return None
    else:
        return None
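
# Sketch: the same lookup with a regex capture group instead of
# split()/slicing, assuming the anchor markup matched above.
def findNext_capture(html):
    m = re.search(r'a href="(.*?)" class="next pagination-item "', html)
    return m.group(1) if m else None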
# Accumulated post bodies for the thread currently being crawled
Contents = []
title = ''

# Download a specific thread page and extract its title and post bodies
def lookInUrl(url):
    global title
    html = download(url)
    if html is not None:
        # Only the first page of a thread has no "上一页" (previous page)
        # link, so the title is extracted there and reused for later pages
        reup = r'a href=".*?">上一页'
        refup = re.compile(reup)
        isFirst = re.findall(refup, html)
        if len(isFirst) == 0:
            # Target markup, e.g.:
            # <h3 class="core_title_txt pull-left text-overflow " title="..." style="width: 396px">...</h3>
            # <h3 class="core_title_txt pull-left text-overflow vip_red " title="..." style="width: 396px">...</h3>
            try:
                reg = r'h3 class="core_title_txt pull-left text-overflow " title="(.*?)</h3>'
                ref = re.compile(reg)
                title = re.findall(ref, html)
                title = title[0].split('>')[1]
                print(title)  # Title ready
            except IndexError:
                # Fall back to the vip_red variant of the title markup
                reg = r'h3 class="core_title_txt pull-left text-overflow vip_red " title=".*?"'
                ref = re.compile(reg)
                title = re.findall(ref, html)
                title = title[0].split(' ')[8][7:-1]
                print(title)
        # Extract the post bodies
        try:
            regg = r'<div id=".*?" class="d_post_content j_d_post_content ">(.*?)</div>'
            reff = re.compile(regg)
            content = re.findall(reff, html)
        except Exception:
            print('Exception while extracting content')
            Excepts(url)
            return
        Contents.append(content)
        getContent(html)  # Follow the thread's in-page "next page" links
        writer(title, Contents)
    else:
        return None
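
# Sketch: one regex covering both title variants (plain and vip_red) in a
# single pass instead of the try/except fallback; the class names come
# from the two patterns above, the function name is ours.
def _find_title(html):
    m = re.search(r'h3 class="core_title_txt pull-left text-overflow (?:vip_red )?" title="(.*?)"', html)
    return m.group(1) if m else None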
# Fallback: parse the thread page with BeautifulSoup when the regex
# extraction raises
def Excepts(url):
    print(url)
    content = urllib.request.urlopen(url)
    soup = bs4.BeautifulSoup(content, 'html.parser')
    posts = soup.find_all('div', {'class': 'd_post_content j_d_post_content '})
    print(posts)
    Contents.append(posts)
    # getContent() expects a string, so pass the serialized soup
    getContent(str(soup))
# Unused helper: locate the thread pager with BeautifulSoup
def getAllExcepts(url):
    content = urllib.request.urlopen(url)
    soup = bs4.BeautifulSoup(content, 'html.parser')
    pager = soup.find_all('li', {'class': 'l_pager pager_theme_5 pb_list_pager'})
    for i in pager:
        if '下一页' in i.get_text():
            print(i)
# Follow the "下一页" (next page) link inside a thread, if any
def getContent(html):
    # e.g. <a href="/p/5013807180?see_lz=1&pn=2">下一页</a>
    regg = r'a href=".*?">下一页'
    reff = re.compile(regg)
    lls = re.findall(reff, html)
    if len(lls) != 0:
        newurl = 'https://tieba.baidu.com' + lls[0][8:-5]
        lookInUrl(newurl)
    else:
        return Contents
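
# Sketch: the same page-following written as a loop; lookInUrl() and
# getContent() recurse once per page, which can get deep on long threads.
# This variant only walks the URLs and is not wired into the main flow.
def _follow_pages(first_url):
    url = first_url
    while url is not None:
        html = download(url)
        if html is None:
            break
        m = re.search(r'a href="(.*?)">下一页', html)
        url = 'https://tieba.baidu.com' + m.group(1) if m else None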
# Write the collected posts into an HTML file named after the thread title
def writer(title, content):
    try:
        with open('E:/J3/' + title + '.html', 'w', encoding='utf-8') as f:
            f.write('<!DOCTYPE html>')
            f.write('<html>')
            f.write('<head>')
            f.write('<meta charset="UTF-8">')
            f.write('<title>%s</title>' % title)
            f.write('</head>')
            f.write('<body>')
            f.write('<table>')
            print('Writing table...')
            for i in content:
                for n in i:
                    f.write('<tr><td>%s</td></tr>' % n)
            f.write('</table>')
            f.write('</body>')
            f.write('</html>')
        print('Write finished')
        time.sleep(2)
    except OSError:
        # Titles containing characters illegal in Windows filenames end up here
        return
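
# Sketch: sanitizing a title before using it as a filename, so writer()
# fails less often; the rejected character set is an assumption about
# Windows, and the function name is ours.
def _safe_filename(title):
    return re.sub(r'[\\/:*?"<>|]', '_', title)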
# Single-thread test:
# lookInUrl('https://tieba.baidu.com/p/4509181593?see_lz=1')
getUrl(url_mian)
# I had only been learning Python for a few days and wrote this crawler as
# soon as I read about crawlers. It's all in one class QwQ; I'll refactor
# it later.