目标:爬取糗事百科的“段子”数据
糗事百科“段子”页面:https://www.qiushibaike.com/text/
1、正则方法
# -*- coding: utf-8 -*-
# Method 1: regular expressions
# Crawls the "jokes" text pages of qiushibaike.com page by page, collecting
# author, vote count, comment count, content, and detail link into `talks`,
# then loads the result into a pandas DataFrame.
import requests
import re
import time

base_url = 'https://www.qiushibaike.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}
url = 'https://www.qiushibaike.com/text/'
talks = []
page_num = 0
while True:  # loops forever unless a break condition below fires
    # 1. fetch the page
    response = requests.get(url=url, headers=headers, verify=False)
    text = response.text
    # 2. extract the data
    # 2a) split the page into one chunk per joke
    contents = re.findall(r'<div class="article block untagged mb15.*?>(.*?)<div class="single-clear">', text, re.S)
    # 2b) pull the individual fields out of each chunk
    for content in contents:
        talk = {}
        # re.findall returns a list; [0] takes the match, strip() trims whitespace
        user_name = re.findall(r'.*?<h2>(.*?)</h2>', content, re.S)[0].strip()
        # the first <i class="number"> is the vote count, the second the comment count
        vote_num = re.findall(r'.*?<i class="number">(.*?)</i>', content, re.S)[0]
        comments_num = re.findall(r'.*?<i class="number">(.*?)</i>', content, re.S)[1]
        detail_link = base_url + re.findall(r'.*?<a href="(.*?)" target="_blank"', content, re.S)[0].strip()
        content = re.findall(r'.*?<div class="content">.*?<span>(.*?)</span>', content, re.S)[0].strip()
        content = re.sub('<.*?>', "", content).strip()  # remove inline tags such as <br>
        talk['user_name'] = user_name
        talk['vote_num'] = int(vote_num)        # store counts as int
        talk['comments_num'] = int(comments_num)
        talk['content'] = content
        talk['detail_link'] = detail_link
        talks.append(talk)
    page_num += 1  # pages crawled so far
    # BUG FIX: re.findall returns a list, so the original
    # `base_url + re.findall(...)` raised TypeError (str + list);
    # base_url is prepended later when the single url is taken out.
    # The greedy `.*` makes this match the LAST <li>, i.e. the "next" link.
    next_url_list = re.findall(r'<li>.*<a href="(.*?)" rel="nofollow">\s<!--<a href', text, re.S)
    # 3. stop condition: no "next page" link found (all pages crawled)
    if not next_url_list:
        break
    # if page_num == 4:  # alternative: stop after a fixed number of pages
    #     break
    # 4. build the next page's url
    url = base_url + next_url_list[0].strip()
    # 5. be gentle to the server / avoid bot detection
    time.sleep(1)  # pause 1 second after each page

# load into a DataFrame
import pandas as pd
result = pd.DataFrame(talks)
2、BeautifulSoup方法
# -*- coding: utf-8 -*-
# Method 2: BeautifulSoup
# Scrapes one page of qiushibaike.com jokes into parallel lists
# (author, votes, comments, link, content) and builds a DataFrame.
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.qiushibaike.com'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
# I. fetch the page
url = 'https://www.qiushibaike.com/text/'
response = requests.get(url=url, headers=headers, verify=False)
html = response.text
# II. extract the data
soup = BeautifulSoup(html, 'lxml')  # soup is a bs4.BeautifulSoup
# print(soup.prettify())  # inspect the parsed markup (str)
# find_all returns a ResultSet; [0] yields a bs4.element.Tag
contents = soup.find_all('div', class_="col1 old-style-col1")[0]
# 1 user names
user_name = []
user_tags = contents.find_all('h2')  # note: only a Tag supports further find_all calls
for user_tag in user_tags:
    user_name.append(user_tag.string.strip())
# 2 vote counts
vote_num = []
vote_tags = contents.find_all('span', attrs={'class': "stats-vote"})
for vote_tag in vote_tags:
    vote_num.append(vote_tag.i.string)  # text of the <i> child inside the stats-vote span
# 3 comment counts
comments_num = []
comments_tags = contents.find_all('span', attrs={'class': "stats-comments"})
for comments_tag in comments_tags:
    # BUG FIX: the original appended vote_tag.i.string — a stale variable
    # left over from the previous loop — so every "comment count" was
    # actually the last vote count. Use the current tag instead.
    comments_num.append(comments_tag.i.string)
# 4 links and content
details = contents.find_all('a', target="_blank", class_="contentHerf")  # <a> matching both attributes
detail_link = []
content = []
for detail in details:
    # 1) link — prefix base_url so it matches methods 1 and 3 (href is site-relative)
    detail_link.append(base_url + detail['href'])
    # 2) content of each joke (Tag.text drops nested tags)
    content.append(detail.text.strip())
# assemble into a dict of columns
talks = {
    'user_names': user_name,
    'vote_num': vote_num,
    'comments_num': comments_num,
    'detail_link': detail_link,
    'content': content
}
# load into a DataFrame
import pandas as pd
result = pd.DataFrame(talks)
3、BeautifulSoup & re
# -*- coding: utf-8 -*-
# Method 3: BeautifulSoup combined with re (handles class attributes that only
# start with a known prefix — see the key takeaways at the end).
import requests
from bs4 import BeautifulSoup
import re

base_url = 'https://www.qiushibaike.com'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
# I. fetch the page
url = 'https://www.qiushibaike.com/text/'
response = requests.get(url=url, headers=headers, verify=False)
html = response.text
# II. extract the data
soup = BeautifulSoup(html, 'lxml')  # must pass the text here — passing `response` itself errors
# print(soup.prettify())  # inspect the parsed markup (str)
# 1) isolate each joke block: the compiled regex matches any class attribute
#    that *begins with* this prefix
contents = soup.find_all('div', class_=re.compile("^article block untagged mb15"))
# 2) pull the fields out of every block
talks = []
for item in contents:
    author = item.find_all('h2')[0].string.strip()
    # .string yields a bs4 NavigableString; .text yields the same characters as plain str
    votes = item.find_all('span', attrs={'class': "stats-vote"})[0].i.string
    comments = item.find_all('span', attrs={'class': "stats-comments"})[0].i.text
    link = base_url + item.find_all('a', class_="contentHerf")[0]['href']
    # .string would be None here because the div nests other tags; .text strips them out
    body = item.find_all('div', class_="content")[0].text.strip()
    # print(author)
    talks.append({
        'user_name': author,
        'vote_num': int(votes),        # store counts as int
        'comments_num': int(comments),
        'detail_content': body,
        'detail_link': link,
    })
# 3) load into a DataFrame
import pandas as pd
result = pd.DataFrame(talks)
# =============================================================================
# Key takeaways:
#
# 1. Navigating to a parent node (very convenient for locating next_url):
# next_url = soup.find_all('span', class_="next")[0].parent['href']  # grab the parent, then read its 'href'
#
# 2. Combining BeautifulSoup with re to match class attributes by prefix:
# contents = soup.find_all('div', class_=re.compile("^article block untagged mb15"))
# =============================================================================