爬虫：爬取某古诗词网站的页面内全部作者的诗句_爬取网页,获取所有王姓诗人的诗类型、诗名和作者名(同一作者的诗排列在一起),再对-CSDN博客

本文链接：https://blog.csdn.net/qq_46105093/article/details/123694672

代码

代码如下（示例）：

import parsel,requests,re
def _attr_line(name, text):
  """Build one generated ``self.<name>='<text>'`` line for the class body.

  NOTE(review): ``name`` and ``text`` are written verbatim. A poem title that
  is not a valid Python identifier, or text containing a single quote, yields
  an output file that is not importable Python.
  """
  return '\t\t' + 'self.' + str(name) + '=\'' + str(text) + '\'\n'


def pa_dan(zuoze_yemian, filename, out_dir='/Users/python/shige'):
  """Scrape one author page and dump its poems into a generated ``.py`` file.

  The page is downloaded, the author name is taken from the author link in
  the page, and a file ``<out_dir>/<author>.py`` is written containing
  ``class <author>:`` with an ``__init__`` that holds one
  ``self.<poem title>='<poem text>'`` line per poem found.

  Args:
    zuoze_yemian: URL of the author page to fetch.
    filename: Fallback name for the class/file, used only when the author
      name cannot be extracted from the page.
    out_dir: Directory that receives the generated file; created if missing.

  Raises:
    requests.RequestException: If the download fails or times out.
    OSError: If the output directory or file cannot be written.
  """
  import os

  # ``headers`` must be a mapping; the previous set literal made requests
  # fail with AttributeError while preparing the request.
  headers = {
      'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
  }
  res = requests.get(url=zuoze_yemian, headers=headers, timeout=30)
  # Let requests guess the charset from the body instead of the HTTP header.
  res.encoding = res.apparent_encoding
  html = res.text

  # Author name as shown in the page's own author link.
  title = re.findall(r'<a href="/chaxun/zuozhe/\d+.html">(.*?)</a></div>', html)
  name = title[0] if title else filename

  # Poems whose preview has an expandable remainder: groups are
  # (title, first line, second line, remaining lines joined by <br />).
  expanded = re.findall(
      r'html"\starget="_blank">《(.*?)》.*\n.*content">\n\s*(.*)<br />(.*?)<br />.*\n.*\n\s*(.*)<br />',
      html)
  # Poems shown in full without an expandable part: (title, text).
  collapsed = re.findall(
      r'html"\starget="_blank">《(.*?)》.*\n.*content">\n\s*(.*?)\s*</div>',
      html)

  os.makedirs(out_dir, exist_ok=True)
  path = os.path.join(out_dir, '{}.py'.format(name))
  with open(path, 'w', encoding='utf-8', errors='ignore') as f:
    f.write('class' + ' ' + str(name) + ':' + '\n')
    f.write('\t' + 'def __init__(self):' + '\n')
    for poem in expanded:
      # Strip the <br /> separators left inside the trailing group.
      text = poem[1] + poem[2] + poem[3].replace('<br />', '')
      f.write(_attr_line(poem[0], text))
    for poem in collapsed:
      f.write(_attr_line(poem[0], poem[1].replace('<br />', '')))



if __name__=='__main__':
  # Entry point: fetch the author index page, collect every author link on
  # it, and scrape each author's page with pa_dan().
  # ``headers`` must be a mapping; the previous set literal made requests
  # fail with AttributeError while preparing the request.
  headers = {
      'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
  }
  res = requests.get(url='https://www.shicimingju.com/chaxun/zuozhe/5.html', headers=headers, timeout=30)
  # Let requests guess the charset from the body instead of the HTTP header.
  res.encoding = res.apparent_encoding
  # Each match is (relative author-page path, link text).
  zuoze = re.findall('<li><a href="(/chaxun/zuozhe.*?)">(.*?)</a></li>', res.text)

  for href, author in zuoze:
    pa_dan('https://www.shicimingju.com' + href, author)