目标:爬取糗事百科的“段子”数据
糗事百科页面:
1、xpath方法
# Code 1: the xpath approach.
# Scrape the "jokes" (duanzi) listing from qiushibaike.com and collect, for
# each post: author name, vote count, text content, and a detail-page link.
import requests
import pandas as pd
from lxml import etree

url = 'https://www.qiushibaike.com/text/'
# No trailing slash: the @href values extracted below already start with '/',
# so a trailing slash here would produce a double slash in detail_link.
base_url = 'https://www.qiushibaike.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}
# NOTE(review): verify=False disables TLS certificate checking -- acceptable
# only for a throwaway scrape; drop it if the site's certificate is valid.
response = requests.get(url=url, headers=headers, verify=False)
html = etree.HTML(response.text)

# Each joke sits in a <div class="article block untagged mb15 ...">.
articles = html.xpath('//div[contains(@class, "article block untagged mb15")]')

talks = []
for article in articles:
    user_name = article.xpath('.//h2/text()')[0].strip()
    supporting_num = article.xpath('.//span[@class="stats-vote"]/i/text()')[0]
    # string(...) concatenates every text node inside the content span,
    # so <br/>-separated lines come back as one string.
    content = article.xpath('string(.//div[@class="content"]/span[1])').strip()
    detail_link = base_url + article.xpath('./a[1]/@href')[0]
    talks.append({
        'user_name': user_name,
        'supporting_num': int(supporting_num),  # cast vote count to int
        'content': content,
        'detail_link': detail_link,
    })

# Collect the scraped records into a DataFrame.
result = pd.DataFrame(talks)
2、正则方法
# Code 2: the regex approach.
# Same scrape as Code 1 but parses the raw HTML with regular expressions;
# additionally extracts the comment count (second <i class="number">).
import requests
import re
import pandas as pd

url = 'https://www.qiushibaike.com/text/'
base_url = 'https://www.qiushibaike.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}
# NOTE(review): verify=False disables TLS certificate checking -- only for a
# throwaway scrape.
response = requests.get(url=url, headers=headers, verify=False)
text = response.text

# Compile the per-post patterns once instead of on every loop iteration.
# re.S makes '.' match newlines, since each field spans multiple HTML lines.
chunk_re = re.compile(r'<div class="article block untagged mb15.*?>(.*?)<div class="single-clear">', re.S)
name_re = re.compile(r'.*?<h2>(.*?)</h2>', re.S)
number_re = re.compile(r'.*?<i class="number">(.*?)</i>', re.S)
link_re = re.compile(r'.*?<a href="(.*?)" target="_blank"', re.S)
content_re = re.compile(r'.*?<div class="content">.*?<span>(.*?)</span>', re.S)

# First cut the page into one HTML chunk per joke.
chunks = chunk_re.findall(text)

# Extract the wanted fields from each chunk.  The loop variable is named
# `chunk` (not `content`, as a careless version might) so that the extracted
# joke text cannot shadow the HTML being parsed.
talks = []
for chunk in chunks:
    # findall returns a list; [0] takes the match, strip() trims whitespace.
    user_name = name_re.findall(chunk)[0].strip()
    # The first <i class="number"> is the vote count, the second the
    # comment count.
    numbers = number_re.findall(chunk)
    vote_num = numbers[0]
    comments_num = numbers[1]
    detail_link = base_url + link_re.findall(chunk)[0].strip()
    content = content_re.findall(chunk)[0].strip()
    talks.append({
        'user_name': user_name,
        'vote_num': int(vote_num),          # cast to int
        'comments_num': int(comments_num),  # cast to int
        'content': content,
        'detail_link': detail_link,
    })

# Collect the scraped records into a DataFrame.
result = pd.DataFrame(talks)