- urllib库是python内置的,无需我们额外安装,只要安装了Python就可以使用这个库。
- requests库是第三方库,需要我们自己安装。
requests库强大好用,所以本文使用requests库获取网页的HTML信息。requests库的github地址:https://github.com/requests/requests
requests安装
# -*- coding:UTF-8 -*-
import requests

if __name__ == '__main__':
    # requests needs a full URL including the scheme; a bare host such as
    # 'www.4399.com' raises requests.exceptions.MissingSchema.
    target = 'http://www.4399.com'
    req = requests.get(url=target)
    # .text decodes the response body to str and prints the page HTML.
    print(req.text)
选取特定标签内容
Beautiful Soup
一个强大的第三方库,都会有一个详细的官方文档。我们很幸运,Beautiful Soup也是有中文的官方文档。
URL:http://beautifulsoup.readthedocs.io/zh_CN/latest/
现在我们要选取下面标签内的内容:
<div class= 'mi_d'>
.....
</div>
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    target = 'http://www.4399.com'
    req = requests.get(url=target)
    # The site serves GBK; set the encoding before reading .text so the
    # Chinese text decodes correctly.
    req.encoding = 'gbk'
    html = req.text
    # Name the parser explicitly: BeautifulSoup(html) alone emits a
    # GuessedAtParserWarning and may pick different parsers per machine.
    bf = BeautifulSoup(html, 'html.parser')
    # Match every <div class="mi_d">; class_ avoids the 'class' keyword.
    texts = bf.find_all('div', class_='mi_d')
    # 'with' guarantees the file is closed even if write() raises.
    with open("测试文档.txt", "w", encoding="gbk") as file:
        file.write(str(texts))
    print(texts)
结果:
[<div class="mi_d"><span><a class="fb6" href="/pcgame/">热游推荐</a></span><span><a href="/special/295.htm">互动阅读</a></span><span><a href="/flash/">最新游戏</a></span><span><a class="fb6" href="/special/1.htm"><b>双人小游戏</b></a></span><span><a href="/special/148.htm">无敌版</a></span><span><a class="fb6" href="/special/219.htm">单人</a></span><span><a href="/special/241.htm">冰火人</a></span><span><a class="fb6" href="/special/225.htm">三人</a></span></div>, <div class="mi_d"><span><a href="/special/232.htm">填颜色</a></span><span><a class="fb6" href="http://www.4399er.com/"><b>儿歌故事</b></a></span><span><a href="/special/286.htm">小马宝莉</a></span><span><a href="/special/17.htm">海绵宝宝</a></span><span><a href="/special/182.htm">朵拉</a></span><span><a href="/special/202.htm">学习</a></span><span><a href="/special/252.htm">玩具</a></span><span><a href="/special/269.htm">托马斯</a></span></div>, <div class="mi_d"><span><a href="/special/13.htm">格斗</a></span><span><a href="/special/242.htm">乐高</a></span><span><a href="/special/270.htm">男生游戏</a></span><span><a href="/special/33.htm">三国</a></span><span><a class="fb6" href="/special/258.htm">功夫</a></span><span><a href="/special/95.htm">变形金刚</a></span><span><a href="/special/136.htm">火柴人</a></span><span><a href="/special/224.htm">西游记</a></span><span><a href="/special/235.htm">机器人</a></span></div>, <div class="mi_d"><span><a href="/special/238.htm">穿越防线</a></span><span><a href="/special/208.htm">狙击手</a></span><span><a href="/special/10.htm">反恐精英</a></span><span><a href="/special/126.htm">大炮</a></span><span><a href="/special/16.htm">战斗机</a></span><span><a href="/special/128.htm">特种兵</a></span><span><a href="/special/29.htm">坦克</a></span><span><a href="/special/121.htm">枪战特警</a></span></div>, <div class="mi_d"><span><a class="fb6" href="/special/8.htm"><b>连连看</b></a></span><span><a href="/special/212.htm">解谜</a></span><span><a href="/special/173.htm">拼图</a></span><span><a href="/special/63.htm">迷宫</a></span><span><a href="/special/98.htm">大富翁</a></span><span><a 
href="/special/60.htm">找茬</a></span><span><a class="fb6" href="/special/59.htm">塔防</a></span><span><a href="/special/168.htm">找东西</a></span><span><a href="/special/87.htm">麻将</a></span><span><a href="/special/218.htm">小火车</a></span></div>, <div class="mi_d"><span><a class="fb6" href="/special/143.htm">斗地主</a></span><span><a href="/special/66.htm">祖玛</a></span><span><a href="/special/34.htm">黄金矿工</a></span><span><a href="/special/64.htm">消消看</a></span><span><a href="/special/79.htm">吃豆豆</a></span><span><a href="/special/62.htm">泡泡龙</a></span><span><a href="/special/125.htm">直升机</a></span><span><a href="/special/152.htm">农场</a></span><span><a href="/special/293.htm">组装游戏</a></span></div>, <div class="mi_d"><span><a href="/special/28.htm">赛车</a></span><span><a href="/special/96.htm">卡丁车</a></span><span><a href="/special/164.htm">越野车</a></span><span><a href="/special/37.htm">自行车</a></span><span><a href="/special/167.htm">汽车</a></span><span><a href="/special/100.htm">停车</a></span><span><a href="/special/35.htm">篮球</a></span><span><a class="fb6" href="/special/38.htm"><b>足球</b></a></span><span><a href="/special/118.htm">过山车</a></span><span><a href="/special/31.htm">台球</a></span></div>, <div class="mi_d"><span><a href="/special/172.htm">中文精选</a></span><span><a class="fb6" href="/special/93.htm">大鱼吃小鱼</a></span><span><a href="/special/277.htm">动漫游戏</a></span><span><a href="/special/89.htm">冒险岛</a></span><span><a href="/special/120.htm">忍者</a></span><span><a href="/special/259.htm">跑酷</a></span><span><a href="/special/124.htm">战士</a></span><span><a class="fb6" href="/special/90.htm">闯关</a></span><span><a href="/special/260.htm">3D游戏</a></span></div>]
剔除不需要的标签或制表符
find_all匹配的返回的结果是一个列表。
提取匹配结果后,使用text属性,提取文本内容,滤除br标签。随后使用replace方法,剔除空格,替换为回车进行分段。`&nbsp;`在HTML中是用来表示空格的,对应字符'\xa0':replace('\xa0'*8, '\n\n')
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    target = 'http://www.4399.com'
    req = requests.get(url=target)
    # The site serves GBK; set the encoding before reading .text.
    req.encoding = 'gbk'
    html = req.text
    # Explicit parser avoids GuessedAtParserWarning and keeps the result
    # reproducible across machines.
    bf = BeautifulSoup(html, 'html.parser')
    texts = bf.find_all('div', class_='mi_d')
    # .text strips all tags (including <br>); eight '\xa0' (&nbsp;) in a
    # row are the page's indent — replace them with a blank line.
    content = texts[0].text.replace('\xa0' * 8, '\n\n')
    # 'with' guarantees the file is closed even if write() raises.
    with open("测试文档.txt", "w", encoding="gbk") as file:
        file.write(str(content))
    # Bug fix: print the extracted text, not the raw tag list — the
    # "结果" shown below this snippet is the plain text.
    print(content)
结果:
热游推荐互动阅读最新游戏双人小游戏无敌版单人冰火人三人
多次调用Beautiful Soup不断剔除
# Fragment: assumes `target` holds the table-of-contents URL.
req = requests.get(url=target)
html = req.text
# First pass: isolate the <div class="listmain"> container.
div_bf = BeautifulSoup(html, 'html.parser')
div = div_bf.find_all('div', class_='listmain')
# Second pass: re-parse just that div so find_all('a') only sees the
# links inside it, not every <a> on the page.
a_bf = BeautifulSoup(str(div[0]), 'html.parser')
a = a_bf.find_all('a')
获取标签的链接
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    # Site root; relative hrefs from the page are appended to it.
    server = 'http://www.biqukan.com/'
    target = 'http://www.biqukan.com/1_1094/'
    req = requests.get(url=target)
    html = req.text
    # First pass: isolate the <div class="listmain"> chapter list.
    div_bf = BeautifulSoup(html, 'html.parser')
    div = div_bf.find_all('div', class_='listmain')
    # Second pass: parse only that div so we collect just its <a> tags.
    a_bf = BeautifulSoup(str(div[0]), 'html.parser')
    a = a_bf.find_all('a')
    for each in a:
        # each.string is the link text; each.get('href') the relative URL.
        print(each.string, server + each.get('href'))
整合代码
www.biqukan.com例子
# -*- coding:UTF-8 -*-
import os
from bs4 import BeautifulSoup
import requests, sys


class downloader(object):
    """Download the novel《蛊真人》from the Biqukan site.

    Modify:
        2017-09-13
    """

    def __init__(self):
        self.server = 'http://www.biqukan.com/'        # site root, prepended to relative hrefs
        self.target = 'http://www.biqukan.com/2_2887/'  # table-of-contents page
        self.names = []  # chapter titles
        self.urls = []   # chapter URLs
        self.nums = 0    # chapter count

    def get_download_url(self):
        """Fetch the table of contents and fill names/urls/nums."""
        # Bypass any configured proxy for this host.
        os.environ['NO_PROXY'] = 'biqukan.com'
        req = requests.get(url=self.target)  # fetch the whole TOC page
        html = req.text
        # Explicit parser avoids GuessedAtParserWarning.
        div_bf = BeautifulSoup(html, 'html.parser')
        div = div_bf.find_all('div', class_='listmain')
        # Re-parse only the listmain div so find_all('a') sees just the
        # chapter links.
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')
        a = a_bf.find_all('a')
        # Drop the first 15 links (unneeded entries) and count the rest.
        self.nums = len(a[15:])
        for each in a[15:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        """Return the chapter text at URL *target* as one string.

        Parameters:
            target - chapter download URL (string)
        Returns:
            texts - chapter content (string)
        """
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', class_='showtxt')
        # Eight '\xa0' (&nbsp;) in a row are the site's paragraph indent;
        # turn each run into a blank line to split paragraphs.
        texts = texts[0].text.replace('\xa0' * 8, '\n\n')
        return texts

    def writer(self, name, path, text):
        """Append one chapter to the output file.

        Parameters:
            name - chapter title (string)
            path - output file name in the current directory (string)
            text - chapter content (string)
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')


if __name__ == "__main__":
    dl = downloader()
    dl.get_download_url()
    print('《蛊真人》开始下载:')
    for i in range(dl.nums):
        dl.writer(dl.names[i], '蛊真人.txt', dl.get_contents(dl.urls[i]))
        # Bug fix: i/dl.nums is a 0..1 fraction but was printed with a
        # '%' sign; scale by 100 and count i+1 so it can reach 100%.
        sys.stdout.write(" 已下载:%.3f%%" % float((i + 1) / dl.nums * 100) + '\r')
        sys.stdout.flush()
    print('《蛊真人》下载完成')
www.51shucheng.net的例子
# -*- coding:UTF-8 -*-
import os
from bs4 import BeautifulSoup
import requests, sys


class downloader(object):
    """Download the novel《我的天才女友》from the 51shucheng site.

    Modify:
        2021-02-3
    """

    def __init__(self):
        self.server = 'https://www.51shucheng.net/'  # site root, prepended to relative hrefs
        self.target = 'https://www.51shucheng.net/waiguo/wodetiancainvyou/'  # TOC page
        self.names = []  # chapter titles
        self.urls = []   # chapter URLs
        self.nums = 0    # chapter count

    def get_download_url(self):
        """Fetch the table of contents and fill names/urls/nums."""
        # Bypass any configured proxy for this host.
        os.environ['NO_PROXY'] = '51shucheng.net'
        req = requests.get(url=self.target)  # fetch the whole TOC page
        html = req.text
        # Explicit parser avoids GuessedAtParserWarning.
        div_bf = BeautifulSoup(html, 'html.parser')
        div = div_bf.find_all('div', class_='mulu-list')
        # Re-parse only the mulu-list div so find_all('a') sees just the
        # chapter links.
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')
        a = a_bf.find_all('a')
        # NOTE(review): the [15:] skip was carried over from the biqukan
        # script; verify this site's mulu-list really has 15 junk links.
        self.nums = len(a[15:])
        for each in a[15:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        """Return the chapter text at URL *target* as one string.

        Parameters:
            target - chapter download URL (string)
        Returns:
            texts - chapter content (string)
        """
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', class_='neirong')
        # Eight '\xa0' (&nbsp;) in a row are the site's paragraph indent;
        # turn each run into a blank line to split paragraphs.
        texts = texts[0].text.replace('\xa0' * 8, '\n\n')
        return texts

    def writer(self, name, path, text):
        """Append one chapter to the output file.

        Parameters:
            name - chapter title (string)
            path - output file name in the current directory (string)
            text - chapter content (string)
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')


if __name__ == "__main__":
    dl = downloader()
    dl.get_download_url()
    print('《我的天才女友》开始下载:')
    for i in range(dl.nums):
        dl.writer(dl.names[i], '我的天才女友.txt', dl.get_contents(dl.urls[i]))
        # Bug fix: i/dl.nums is a 0..1 fraction but was printed with a
        # '%' sign; scale by 100 and count i+1 so it can reach 100%.
        sys.stdout.write(" 已下载:%.3f%%" % float((i + 1) / dl.nums * 100) + '\r')
        sys.stdout.flush()
    print('《我的天才女友》下载完成')