Python Web Scraping
1. Fetching web page content
a. Using the urllib library
# Import the module
import urllib.request

headers = {
    "User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"
}

def use_urllib(url=None, headers=None):
    """
    Some sites have anti-scraping measures, so a headers dict is passed to imitate a browser.
    :param url: page URL
    :param headers: request headers carrying the User-Agent
    :return: the HTML content of the page
    """
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    data = response.read().decode()  # decode the page content
    print(data)
    return data
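A quick way to try it (the URL below is only a placeholder example):
if __name__ == "__main__":
    # Placeholder target; swap in the page you actually want to fetch
    html = use_urllib("https://www.example.com", headers=headers)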
b. Using the requests library
import requests

def use_requests(url, headers=None):
    """
    :param url: page URL
    :param headers: request headers; usually optional
    :return: the HTML content of the page
    """
    response = requests.get(url=url, headers=headers)
    print(response)
    data = response.text
    return data
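Usage mirrors the urllib version; a minimal call with a placeholder URL:
if __name__ == "__main__":
    html = use_requests("https://www.example.com")
    print(html[:200])  # preview the first 200 characters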
c. Using the selenium library to drive a browser
Both Chrome and Firefox can be used, and each needs its own webdriver. For Chrome this is chromedriver, and the version must match the installed browser; for Firefox it is geckodriver. After downloading, extract the executable into the Python directory of the environment you run the code from (or anywhere on the system PATH).
Usage is as follows:
import selenium.webdriver as sw

def use_Chrome(url, file_path):
    # Use the Chrome browser
    driver = sw.Chrome()
    # Open the target URL
    driver.get(url)
    # Grab the rendered page source
    data = driver.page_source
    # Save the page content locally
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(data)
    # Close the browser
    driver.close()

def use_Firefox(url, file_path):
    driver = sw.Firefox()
    driver.get(url)
    data = driver.page_source
    # Save the page content locally
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(data)
    driver.close()
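If the driver executable is not on the system PATH, Selenium 4 can be pointed at it explicitly through a Service object. A minimal sketch, assuming Selenium 4 and a made-up driver path:
from selenium.webdriver.chrome.service import Service

def use_Chrome_with_path(url, file_path):
    # "/path/to/chromedriver" is a placeholder; use wherever the driver was extracted
    driver = sw.Chrome(service=Service("/path/to/chromedriver"))
    driver.get(url)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(driver.page_source)
    driver.close()

# Hypothetical call using the functions defined above
use_Chrome("https://www.dushu.com", "dushu_home.html")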
2. Crawling the first ten pages of books in the fiction category of a reading site
import requests
from bs4 import BeautifulSoup
import csv

# Book titles
book_name_list = []
# Authors
author_name_list = []
# Summaries
abstract_list = []

for i in range(10):
    # The page number is the only part of the URL that changes, so it drives the outer loop
    url = "https://www.dushu.com/book/1078_" + str(i + 1) + ".html"
    response = requests.get(url)
    data = response.text
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(data, "html.parser")
    for j in range(40):
        # Each book on the page shares the same selector, apart from its li index
        item = ("body > div:nth-child(6) > div > div.span19.padding-left > "
                "div.bookslist > ul > li:nth-child(" + str(j + 1) + ") > div")
        book_names = soup.select(item + " > h3 > a")
        author_names = soup.select(item + " > p:nth-child(3)")
        abstracts = soup.select(item + " > p.disc.eps")
        # Store the parsed titles, authors and summaries in the lists
        for book_name in book_names:
            book_name_list.append(
                book_name.get_text().replace("\n", "").replace(" ", ""))
        for author_name in author_names:
            author_name_list.append(
                author_name.get_text().replace("\n", "").replace(" ", ""))
        for abstract in abstracts:
            abstract_list.append(
                abstract.get_text().replace("\n", "").replace(" ", ""))
To find the corresponding selector: right-click the page and choose Inspect, click the element picker in the top-left corner of the dev tools panel, click a book title on the page, then right-click the highlighted node and copy its selector.
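Copied selectors like the ones above are brittle if the page layout shifts. A looser sketch that grabs every book item on a page by class, assuming the same div.bookslist > ul > li structure seen in the copied selectors:
for li in soup.select("div.bookslist > ul > li"):
    title_tag = li.select_one("h3 > a")
    if title_tag is not None:
        print(title_tag.get_text().strip())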
3. Writing the crawled content to a CSV file
def save_csv(path):
    with open(path, 'w', newline='', encoding='utf_8_sig') as f:
        # Column headers
        fieldnames = ["书名", "作者", "简介"]
        # Write rows as dictionaries
        f_csv = csv.DictWriter(f, fieldnames)
        # Write the header row
        f_csv.writeheader()
        for i in range(len(book_name_list)):
            # Write one row per book; the dict keys must match the header names
            f_csv.writerow({
                "书名": book_name_list[i],
                "作者": author_name_list[i],
                "简介": abstract_list[i],
            })
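Calling it with an output path writes one row per scraped book; the filename is just an example, and utf_8_sig adds a BOM so Excel displays the Chinese headers correctly:
save_csv("dushu_novels.csv")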
The result is as follows: