# -*- coding: utf-8 -*-
import io
import sys

import requests
from bs4 import BeautifulSoup

# Re-wrap stdout so Chinese text prints correctly in consoles whose
# default encoding is not UTF-8 (e.g. the VS Code terminal on Windows).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
# Browser-like User-Agent so the site serves the normal HTML page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
}
# Fixed prefix of the listing URL; the page number is appended per request.
base_url = 'https://www.qiushibaike.com/8hr/page/'
# Explicit UTF-8: the script writes Chinese text, and the platform default
# encoding (e.g. cp936/gbk on some Windows setups, ascii elsewhere) is not
# guaranteed to handle it.
file = open('qiubai.txt', 'w', encoding='utf-8')
# Current page number, starting from page 1.
num = 1
# CSS class of each joke category on the listing page, paired with the
# Chinese section label written to the file before every joke of that
# category. Replaces four duplicated copies of the same loop body.
_SECTIONS = (
    ('article block untagged mb15 typs_hot', '--热门--'),
    ('article block untagged mb15 typs_long', '--长篇--'),
    ('article block untagged mb15 typs_recent', '--近期--'),
    ('article block untagged mb15 typs_old', '--经典--'),
)


def _write_jokes(out, soup, css_class, label):
    """Write every text-only joke of one category to *out*.

    Entries that contain a thumbnail image (class 'thumb') are image
    jokes whose text is meaningless on its own, so they are skipped.
    """
    for div in soup.find_all(class_=css_class):
        if div.find_all(class_='thumb'):
            continue
        joke = div.span.get_text()
        out.write(label)
        out.write(joke + '\n')
        out.write('------' + '\n')


# Scrape listing pages 1..10. (The original `while 1` loop incremented
# num manually and broke once num exceeded 10 — i.e. exactly pages 1-10.)
for num in range(1, 11):
    file.write('第' + str(num) + '页' + '\n')
    r = requests.get(base_url + str(num), headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    for css_class, label in _SECTIONS:
        _write_jokes(file, soup, css_class, label)
    print('第{}页,完成!'.format(num))
# Bug fix: the original said `file.close` (no parentheses), which merely
# references the bound method and never closes the file.
file.close()