本次实战采取爬取糗事百科的段子
首先找到糗事百科的网址:http://www.qiushibaike.com/
我们这次打算爬取文字模块。其网址部分是http://www.qiushibaike.com/text/
然后去找到总共有多少页。打开审查元素,选择左上角的箭头,选择网页上的元素
可以看到
我们所需要的元素都在li标签下的span内,这对我们之后写正则表达式起到关键的作用。
再仔细观察每页的网址,如第2页, 第8页,第35页等,可以发现一个规律
即所有页面的规律符合http://www.qiushibaike.com/text/page/X/ (对第一页同样成立)
结合以上所有的信息,我们可以编写出我们的Python 爬虫
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup
import requests
import random
# ----------- 处理页面上的各种标签 -----------
class HTML_Tool:
    """Strip HTML markup from a page fragment and decode common entities.

    ``Replace_Char`` removes whitespace/link/image noise, turns structural
    tags (``<p>``, ``<br/>``, ``<td>`` ...) into newlines/tabs, drops every
    remaining tag, then decodes HTML entities back to literal characters.
    """
    # Non-greedy: drop tabs, newlines, spaces, <a ...> and <img ...> tags.
    BgnCharToNoneRex = re.compile(r"(\t|\n| |<a.*?>|<img.*?>)")
    # Non-greedy: any remaining <...> tag.
    EndCharToNoneRex = re.compile(r"<.*?>")
    # Non-greedy: opening <p ...> tags (become newline + indent).
    BgnPartRex = re.compile(r"<p.*?>")
    # Tags that mark a line break in the extracted text.
    CharToNewLineRex = re.compile(r"(<br/>|</p>|<tr>|<div>|</div>)")
    # Table cells become tab-separated.
    CharToNextTabRex = re.compile(r"<td>")
    # HTML entity -> literal character. Bug fix: the table previously held
    # garbled identity pairs such as ("<", "<"), so no entity was ever
    # decoded; restored the intended &lt;/&gt;/&amp;/&quot;/&nbsp; mapping.
    replaceTab = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&"),
                  ("&quot;", "\""), ("&nbsp;", " ")]

    def Replace_Char(self, x):
        """Return *x* with markup removed and entities decoded.

        :param x: raw HTML fragment (str)
        :return: plain-text rendering of the fragment
        """
        x = self.BgnCharToNoneRex.sub("", x)
        x = self.BgnPartRex.sub("\n ", x)
        x = self.CharToNewLineRex.sub("\n", x)
        x = self.CharToNextTabRex.sub("\t", x)
        # Must run after the structural substitutions so their tags survive,
        # and before entity decoding so decoded "<" / ">" are not eaten.
        x = self.EndCharToNoneRex.sub("", x)
        for entity, literal in self.replaceTab:
            x = x.replace(entity, literal)
        return x
class Qiubai_Spider:
    """Crawler for the text section of qiushibaike.com.

    ``__init__`` reads the total page count from the first listing page,
    then fetches every page and prints each story via :meth:`find_title`.
    All ``print`` calls use the single-argument parenthesized form, which
    behaves identically under Python 2 and Python 3.
    """

    def __init__(self, url):
        """Start crawling immediately.

        :param url: base URL of the text section, must end with ``/``
                    (pages are built as ``url + 'page/' + i``).
        """
        # Pool of desktop User-Agent strings; one is picked at random.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        # Harvest a list of proxy IPs (first <td> of every table row).
        # NOTE(review): iplist is filled but never used below — wire it into
        # requests via the ``proxies=`` argument, or drop this fetch.
        self.iplist = []
        response = requests.get('http://haoip.cc/index/2061578.htm')
        ips = BeautifulSoup(response.text, 'lxml').find_all('tr')
        for ip in ips:
            self.iplist.append(ip.find('td').get_text().strip())
        UA = random.choice(self.user_agent_list)
        print(UA)
        header = {'User-Agent': UA}
        # Bug fix: the second positional argument of ``requests.get`` is
        # ``params``; the UA dict must be passed as ``headers=`` or the
        # request goes out without the custom User-Agent.
        html = requests.get(url, headers=header)
        # The last "page-numbers" span on the first page holds the total
        # page count.
        contents = BeautifulSoup(html.text, 'lxml').find_all('span', class_="page-numbers")[-1].get_text()
        print(contents)
        for i in range(1, int(contents) + 1):
            # Listing pages follow the pattern <base>/page/<i>.
            self.myUrl = url + 'page/' + str(i)
            print(self.myUrl)
            self.datas = []
            self.myTool = HTML_Tool()
            self.find_title(self.myUrl)
        print(u'已经启动糗事百科爬虫,咔嚓咔嚓')

    def find_title(self, myPage):
        """Fetch one listing page and print the text of every story.

        :param myPage: full URL of a listing page
        """
        page_html = requests.get(myPage)
        contents = BeautifulSoup(page_html.text, 'lxml').find_all('div', class_="content")
        for content in contents:
            # Strip characters that are illegal in Windows file names, so
            # the text can later be saved to disk under its own name.
            text = content.find('span').get_text().replace('\\', '').replace('/', '').replace(':', '').replace('*', '').replace('?', '').replace('"', '').replace(
                '>', '').replace('<', '').replace('|', '')
            print(text + '\n\n')
# -------- 程序入口处 ------------------
# Banner, then kick off the crawl. Parenthesized single-argument print works
# identically under Python 2 and Python 3 (the bare `print u"""..."""`
# statement form is a syntax error on Python 3).
print(u"""#---------------------------------------
# 程序:糗事百科爬虫
""")
bdurl = 'http://www.qiushibaike.com/text/'
# NOTE(review): consider wrapping this in `if __name__ == '__main__':` so
# importing the module does not immediately start crawling.
mySpider = Qiubai_Spider(bdurl)
结果为
当然还有很大的完善空间,比如保存结果,以及代理设定,timeout等,不过现在得去上课(哎),以后会完善