一、requests库基础知识
-
requests库的方法
-
requests库的response对象
二、爬取网站所需信息
1.访问网站 https://archiveofourown.org/media ,如图1-1所示:
图1-1
2.点击子页面,审查网页元素,部分内容如图1-2所示:
图1-2
3.实现代码如下:
#coding:utf-8
import requests
from bs4 import BeautifulSoup
import xlsxwriter
def GET_HTML_CONTENT(url, timeout=30):
    """Download ``url`` and return the response body as text.

    The request carries a desktop-browser User-Agent header so that the
    server sees it as an ordinary browser visit.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx
            status.
        requests.exceptions.RequestException: any other transport failure,
            including a timeout.
    """
    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, so the header value does not depend on how the
    # following source line happens to be indented.
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.132 Safari/537.36')
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error response rather than handing an error page
    # to the HTML parsers downstream.
    r.raise_for_status()
    return r.text
def GET_CHILD_URL(content):
    """Extract category names and their sub-page links from the index page.

    Every ``<li class="medium listbox group">`` element is scanned for
    ``<h3 class="heading">`` children; the first ``<a>`` inside each heading
    supplies the category name (its text) and the sub-page URL (its
    ``href``).

    Args:
        content: HTML source of the index page.

    Returns:
        A list of dicts, one per heading, with keys ``'name'`` and
        ``'nextpage'``. Headings that contain no link, or whose link has no
        ``href``, are skipped.
    """
    data = BeautifulSoup(content, "html.parser")
    childurl = []
    for session in data.find_all('li', attrs={'class': "medium listbox group"}):
        for element in session.find_all('h3', attrs={'class': "heading"}):
            # Look the anchor up once and make sure it is usable; a heading
            # without a link would otherwise raise on ``None``.
            link = element.find('a')
            if link is None or not link.get('href'):
                continue
            childurl.append({'name': link.text, 'nextpage': link['href']})
    return childurl
def GET_CHILD_INFO(content, kind):
    """Extract one record per tag list from a category sub-page.

    Only the first ``<ol class="alphabet fandom index group ">`` element is
    examined. Within it, each ``<ul class="tags index group">`` yields one
    record built from the text of its first ``<a>`` and the last line of
    the list's text with surrounding parentheses removed.

    Args:
        content: HTML source of the category sub-page.
        kind: Category name stored with every record.

    Returns:
        A list of dicts with keys ``'kinds'``, ``'name'`` and ``'reviews'``.
        The list is empty when the page has no matching ``<ol>``; lists
        that contain no ``<a>`` are skipped.
    """
    data = BeautifulSoup(content, "html.parser")
    # NOTE(review): the class string ends with a space. Whether that matches
    # depends on how the installed BeautifulSoup version normalises the
    # class attribute -- confirm against the live page.
    book_session = data.find_all('ol', attrs={'class': "alphabet fandom index group "})
    if not book_session:
        # Unexpected layout (or an error page): nothing to extract.
        return []
    items = book_session[0].find_all('ul', attrs={'class': "tags index group"})
    books = []
    for item in items:
        link = item.find('a')
        if link is None:
            continue
        # NOTE(review): the name comes from the first anchor while the count
        # comes from the last text line of the whole list; if a list holds
        # several entries these may belong to different entries -- verify.
        last_line = item.text.strip().split('\n')[-1]
        books.append({
            'kinds': kind,
            'name': link.text,
            'reviews': last_line.strip().strip('()'),
        })
    return books
if __name__ == '__main__':
    # Crawl the media index, visit every category sub-page and dump the
    # extracted records into data.xlsx (one row per record, header in row 0).
    url = 'https://archiveofourown.org/media'
    content = GET_HTML_CONTENT(url)
    childurl = GET_CHILD_URL(content)
    header = [u'类别', u'名称', u'评论数']
    workbook = xlsxwriter.Workbook("data.xlsx")
    try:
        worksheet = workbook.add_worksheet()
        worksheet.write_row(0, 0, header)
        row = 1  # next free spreadsheet row; row 0 holds the header
        for k in childurl:
            kind = k['name']
            # Sub-page links are site-relative, so prefix the site root.
            geturl = 'https://archiveofourown.org' + k['nextpage']
            txt = GET_HTML_CONTENT(geturl)
            for info in GET_CHILD_INFO(txt, kind):
                worksheet.write_row(
                    row, 0, [info['kinds'], info['name'], info['reviews']])
                row += 1
    finally:
        # Close even when a fetch or parse step fails, so the workbook is
        # always finalised and the rows collected so far are kept.
        workbook.close()
4.运行结果如图1-3所示:
图1-3
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/31541436/viewspace-2219842/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/31541436/viewspace-2219842/