Preface: I've wanted to scrape something for a while, and recently started learning to crawl pages with Python. Might as well start with my own blog pages.
Environment:
Python 3.6
Modules: BeautifulSoup (install it with pip install beautifulsoup4, which tripped me up), random, and xlwt (for writing Excel files)
Goal:
Grab every article's title, description, creation time, view count, and comment count
Getting started
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
import random
import xlwt
import re
# Pick a random User-Agent to reduce the chance of being blocked.
# Not strictly necessary for a script this small...
def getAgent():
    agents = [
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    ]
    return random.choice(agents)
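A random User-Agent is only half of being a well-behaved crawler; spacing requests out over time is the other half. A minimal sketch of a random delay helper (politePause is my own name, not part of the original script):

import random
import time

def politePause(min_s=1.0, max_s=3.0):
    # Sleep for a random interval so requests are not fired in a tight loop.
    time.sleep(random.uniform(min_s, max_s))

Calling it once before each page request is enough when crawling more than a couple of pages.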
Requesting the data
def requestCSDN():
    print('requesting data')
    url = "http://blog.csdn.net/qq_24142325"
    values = {
        'viewmode': 'list'  # query parameters go here; urlencode turns the dict into "viewmode=list"
    }
    result = getHtml(url, values)
    return result
def getHtml(url, values):
    headers = {'User-Agent': getAgent()}
    data = urllib.parse.urlencode(values)
    # Build the full URL and attach the headers via a Request object;
    # without this, the User-Agent above would never actually be sent.
    req = urllib.request.Request(url + '?' + data, headers=headers)
    response_result = urllib.request.urlopen(req).read()
    html = response_result.decode('utf-8')
    return html
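urlopen raises on HTTP errors and network failures, so for anything beyond a one-off run it is worth catching them. A hedged sketch of a wrapper (getHtmlSafe is a name I made up):

import urllib.error

def getHtmlSafe(url, values):
    # Same request as getHtml, but report failures instead of crashing.
    try:
        return getHtml(url, values)
    except urllib.error.HTTPError as e:
        print('HTTP error:', e.code)
    except urllib.error.URLError as e:
        print('network error:', e.reason)
    return None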
Processing the page
First, here is the data we need, i.e. what the articlesHtml variable below will hold:
<div class="list_item article_item">
<div class="article_title">
<span class="ico ico_type_Original"></span>
<h1>
<span class="link_title">
<a href="/qq_24142325/article/details/75008179">
【读书笔记】模拟对话,理解TCP的三次握手与四次挥手
</a>
</span>
</h1>
</div>
<div class="article_description">
前言:看到一篇博文图解 TCP 三次握手与四次分手,记录自己的理解。
三次握手:一:client:喂喂喂?server你听的到吗?
第一次握手:# 测试自己的发信能力和对方的收信能力
二:server:恩恩,我听的到你,你听的到我吗?
第二次握手:#证明对方的发信能力和自己的收信能力,测试自己的发信能力和对方的收信能力
三:client:我也能听到你啊,那我们就开始对话吧!
第三次握手:...
</div>
<div class="article_manage">
<span class="link_postdate">2017-07-12 12:04</span>
<span class="link_view" title="阅读次数"><a href="/qq_24142325/article/details/75008179" title="阅读次数">阅读</a>(18)</span>
<span class="link_comments" title="评论次数"><a href="/qq_24142325/article/details/75008179#comments" title="评论次数" onclick="_gaq.push(['_trackEvent','function', 'onclick', 'blog_articles_pinglun'])">评论</a>(0)</span>
</div>
<div class="clear"></div>
</div>
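Before wiring everything together, it helps to check the selectors against this block in isolation. A quick sketch using a trimmed copy of the HTML above, just enough markup to exercise the lookups:

from bs4 import BeautifulSoup

snippet = '''
<div class="list_item article_item">
  <div class="article_title"><h1><span class="link_title">
    <a href="/qq_24142325/article/details/75008179">TCP handshake notes</a>
  </span></h1></div>
  <div class="article_manage">
    <span class="link_postdate">2017-07-12 12:04</span>
    <span class="link_view"><a href="#">阅读</a>(18)</span>
    <span class="link_comments"><a href="#">评论</a>(0)</span>
  </div>
</div>'''

soup = BeautifulSoup(snippet, 'html.parser')
item = soup.find(class_="list_item article_item")
print(item.find(class_="article_title").h1.span.a.string)  # the article title
print(item.find(class_="link_view").text)                  # "阅读(18)"
print(item.find(class_="link_comments").text)              # "评论(0)"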
Main body:
# Purpose: pull the fields we need out of the page data, then write them to Excel.
def delResult(result):
    soup = BeautifulSoup(result, 'html.parser')
    title = soup.find(id='blog_title').h2.a.string  # find returns only the first match
    articles = []
    articlesHtml = soup.find_all(class_="list_item article_item")  # the blocks listed above
    for art in articlesHtml:
        article = {'name': '', 'des': '', 'time': '', 'read': '', 'note': ''}  # store each article's fields in a dict
        article['name'] = trimSpaceAndLineBread(art.find(class_="article_title").h1.span.a.string)  # strip spaces and line breaks
        article['des'] = trimSpaceAndLineBread(art.find(class_="article_description").string)
        article['time'] = trimSpaceAndLineBread(art.find(class_="article_manage").find(class_="link_postdate").string)
        article['read'] = txt_wrap_by('(', ')', trimSpaceAndLineBread(art.find(class_="article_manage").find(class_="link_view").text))  # .text looks like "阅读(18)"; we only want the "18" inside the parentheses
        article['note'] = txt_wrap_by('(', ')', trimSpaceAndLineBread(art.find(class_="article_manage").find(class_="link_comments").text))
        articles.append(article)  # collect into articles
    result = {'title': title, 'articles': articles}
    saveExl('index.xls', result)  # save to an Excel sheet
# Strip spaces and line breaks. Guard against None first; otherwise
# calling replace on a None value would raise an AttributeError.
def trimSpaceAndLineBread(s):
    if s is not None:
        s = s.replace('\r', '').replace('\n', '').replace(' ', '')
    return s
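As an aside, the script imports re but never uses it; the same cleanup can be done in a single regex pass (trimWithRegex is just an illustrative name):

import re

def trimWithRegex(s):
    # Drop \r, \n and spaces in one substitution; None passes through unchanged.
    return re.sub(r'[\r\n ]', '', s) if s is not None else None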
# Extract the text between two delimiters, e.g. the "18" inside the
# parentheses of "阅读(18)". Returns None if either delimiter is missing.
def txt_wrap_by(start_str, end_str, html):
    start = html.find(start_str)
    if start >= 0:
        start += len(start_str)
        end = html.find(end_str, start)
        if end >= 0:
            return html[start:end].strip()
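Since the pattern here is always digits inside parentheses, a regex is a natural alternative to the generic txt_wrap_by (count_in_parens is a hypothetical helper, not from the original script):

import re

def count_in_parens(text):
    # Pull the digits out of e.g. "阅读(18)"; returns None if there is no match.
    m = re.search(r'\((\d+)\)', text)
    return m.group(1) if m else None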
# Write the results to an Excel file.
def saveExl(path, result):
    try:
        workbook = xlwt.Workbook()  # note the capital W in Workbook
        sheet1 = workbook.add_sheet(result['title'], cell_overwrite_ok=True)
        c = 0
        sheet1.write(c, 0, 'Title')
        sheet1.write(c, 1, 'Description')
        sheet1.write(c, 2, 'Created')
        sheet1.write(c, 3, 'Views')
        sheet1.write(c, 4, 'Comments')
        for f in result['articles']:  # one row per article
            c = c + 1
            sheet1.write(c, 0, f['name'])
            sheet1.write(c, 1, f['des'])
            sheet1.write(c, 2, f['time'])
            sheet1.write(c, 3, f['read'])
            sheet1.write(c, 4, f['note'])
            print('writing row ' + str(c))
        workbook.save(path)
        print('done')
    except IOError:
        print('write failed')
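If a real .xls file is not a hard requirement, the standard library's csv module does the same job with no extra dependency. A sketch (saveCsv mirrors saveExl but is my own addition):

import csv

def saveCsv(path, result):
    # Same columns as saveExl, written as UTF-8 CSV.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Description', 'Created', 'Views', 'Comments'])
        for a in result['articles']:
            writer.writerow([a['name'], a['des'], a['time'], a['read'], a['note']])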
Calling it
delResult(requestCSDN())
Result
Running the script writes index.xls, one row per article.
Wrapping up
That's about as simple as a crawler gets. This was my first attempt, and there's still plenty that could be optimized; one step at a time. One idea is sketched below.
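For instance, this only fetches the first list page. Assuming CSDN's older blogs paginate under /article/list/N (an assumption worth checking against the live site), a loop might look like:

def requestAllPages(max_pages=5):
    # Hypothetical pagination loop; the /article/list/N URL pattern is an
    # assumption, not something verified in this post.
    pages = []
    for n in range(1, max_pages + 1):
        url = "http://blog.csdn.net/qq_24142325/article/list/" + str(n)
        pages.append(getHtml(url, {'viewmode': 'list'}))
    return pages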
Here is my complete code:
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
import random
import xlwt
import re

def getAgent():
    agents = [
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    ]
    return random.choice(agents)

def getHtml(url, values):
    headers = {'User-Agent': getAgent()}
    data = urllib.parse.urlencode(values)
    # A Request object is needed so the User-Agent header is actually sent.
    req = urllib.request.Request(url + '?' + data, headers=headers)
    response_result = urllib.request.urlopen(req).read()
    html = response_result.decode('utf-8')
    return html

def requestCSDN():
    print('requesting data')
    url = "http://blog.csdn.net/qq_24142325"
    values = {
        'viewmode': 'list'
    }
    result = getHtml(url, values)
    return result

def saveResult(path, result):
    # Dump the raw HTML to a file (handy for debugging the parser offline).
    try:
        with open(path, 'w', encoding='utf-8') as f:
            f.write(result)
    except IOError:
        print('write failed')

def saveExl(path, result):
    try:
        workbook = xlwt.Workbook()
        sheet1 = workbook.add_sheet(result['title'], cell_overwrite_ok=True)
        c = 0
        sheet1.write(c, 0, 'Title')
        sheet1.write(c, 1, 'Description')
        sheet1.write(c, 2, 'Created')
        sheet1.write(c, 3, 'Views')
        sheet1.write(c, 4, 'Comments')
        for f in result['articles']:
            c = c + 1
            sheet1.write(c, 0, f['name'])
            sheet1.write(c, 1, f['des'])
            sheet1.write(c, 2, f['time'])
            sheet1.write(c, 3, f['read'])
            sheet1.write(c, 4, f['note'])
            print('writing row ' + str(c))
        workbook.save(path)
        print('done')
    except IOError:
        print('write failed')

def delResult(result):
    soup = BeautifulSoup(result, 'html.parser')
    title = soup.find(id='blog_title').h2.a.string
    articles = []
    articlesHtml = soup.find_all(class_="list_item article_item")
    for art in articlesHtml:
        article = {'name': '', 'des': '', 'time': '', 'read': '', 'note': ''}
        article['name'] = trimSpaceAndLineBread(art.find(class_="article_title").h1.span.a.string)
        article['des'] = trimSpaceAndLineBread(art.find(class_="article_description").string)
        article['time'] = trimSpaceAndLineBread(art.find(class_="article_manage").find(class_="link_postdate").string)
        article['read'] = txt_wrap_by('(', ')', trimSpaceAndLineBread(art.find(class_="article_manage").find(class_="link_view").text))
        article['note'] = txt_wrap_by('(', ')', trimSpaceAndLineBread(art.find(class_="article_manage").find(class_="link_comments").text))
        articles.append(article)
    result = {'title': title, 'articles': articles}
    saveExl('index.xls', result)

def trimSpaceAndLineBread(s):
    if s is not None:
        s = s.replace('\r', '').replace('\n', '').replace(' ', '')
    return s

def txt_wrap_by(start_str, end_str, html):
    start = html.find(start_str)
    if start >= 0:
        start += len(start_str)
        end = html.find(end_str, start)
        if end >= 0:
            return html[start:end].strip()

def main():
    delResult(requestCSDN())

if __name__ == '__main__':
    main()
One last thing
Ah, if I'd known the comment counts were all zero I wouldn't have bothered scraping them. Heartbreaking.