爬取糗事百科的段子(正则表达式)
import requests
import re
from bs4 import BeautifulSoup
def judge_sex(sex):
    """Map a gender icon CSS class name to a Chinese gender label.

    Args:
        sex: The class-name fragment captured from the page's
            ``articleGender`` element, e.g. ``"womenIcon"``.

    Returns:
        ``'女'`` (female) when ``sex`` is exactly ``"womenIcon"``;
        ``'男'`` (male) for every other value.
    """
    return '女' if sex == "womenIcon" else '男'
# HTTP request headers passed to requests.get(); the user-agent string
# presents the client as a desktop Chrome browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.116 Safari/537.36'
    ),
}
def _parse_page(html):
    """Extract post records from the HTML text of one listing page.

    Args:
        html: The page source as a string.

    Returns:
        A list of dicts with the keys ``'id'`` (text inside ``<h2>``),
        ``'sex'`` (label produced by ``judge_sex``), ``'age'`` (text inside
        the ``articleGender`` div) and ``'content'`` (text inside ``<span>``,
        still containing any ``<br/>`` separators).
    """
    names = re.findall(r'<h2>(.*?)</h2>', html, re.S)
    # Capture the gender class and the age from the same element so the two
    # values always belong together. The age used to be hard-coded as "21",
    # which silently dropped the gender of every poster of any other age.
    genders_and_ages = re.findall(
        r'<div class="articleGender (.*?)">(.*?)</div>', html
    )
    contents = re.findall(r'<span>(.*?)</span>', html, re.S)

    # NOTE(review): the three result lists are paired purely by position and
    # zip() stops at the shortest one. If a post lacks one of these elements,
    # or the page has other <span> tags, records can shift -- confirm against
    # the real page markup.
    return [
        {
            'id': name,
            'sex': judge_sex(sex_class),
            'age': age,
            'content': content,
        }
        for name, content, (sex_class, age) in zip(
            names, contents, genders_and_ages
        )
    ]


def Get_text(url):
    """Fetch one listing page and print every post found on it.

    For each post the id, gender label and age are printed on separate
    lines, followed by the content with each ``<br/>``-separated segment
    on its own line.

    Args:
        url: Address of the page to download.

    Returns:
        None. Output goes to stdout.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # A timeout keeps an unresponsive server from blocking the run forever.
    res = requests.get(url, headers=headers, timeout=10)
    for info in _parse_page(res.text):
        print(info['id'])
        print(info['sex'])
        print(info['age'])
        for line in info['content'].split("<br/>"):
            print(line)
# Listing pages 1 through 24 (the upper bound of range() is exclusive).
urls = ['https://www.qiushibaike.com/text/page/{}'.format(i) for i in range(1, 25)]
# Download and print each page in order.
for url in urls:
    Get_text(url)
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/364347e59d21a1894e99a371eabb050f.png)