import re

import requests


class Spider():
    """Fetch one article index page and append extracted snippets to a file.

    The spider downloads ``index_<page>.html`` from the site named in
    ``URL_TEMPLATE``, extracts the text that follows every
    ``<div class="desc"`` opening up to the next ``</div>``, and appends
    each match to ``OUTPUT_FILE``.
    """

    # Listing URL; ``{}`` is replaced by the page number.
    URL_TEMPLATE = 'http://www.neihan8.com/article/index_{}.html'

    # Browser-like User-Agent sent with every request.
    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}

    # File that extracted snippets are appended to (relative to the CWD).
    OUTPUT_FILE = "duanzi.txt"

    # Seconds to wait for the server before giving up; without a timeout
    # ``requests.get`` may block forever on an unresponsive host.
    REQUEST_TIMEOUT = 10

    # Raw string so ``\s`` reaches the regex engine untouched. ``re.S`` lets
    # ``.`` span newlines, since a snippet may cover several lines.
    # NOTE(review): the group starts right after ``class="desc"``, so each
    # match still contains the rest of the opening tag (at least ``>``).
    DESC_PATTERN = re.compile(r'<div\sclass="desc"(.*?)</div>', re.S)

    def __init__(self, page=3):
        """Create a spider.

        Args:
            page: Index page number that ``start()`` will fetch. Defaults to
                3, the value that used to be hard-coded in ``start()``.
        """
        # The previous ``self.page = self.page`` read an attribute that did
        # not exist yet and raised AttributeError on every construction.
        self.page = page

    def loadpage(self, page):
        """Download index page ``page``, extract the snippets and store them.

        Args:
            page: Page number inserted into ``URL_TEMPLATE``.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                exceeds ``REQUEST_TIMEOUT``.
        """
        url = self.URL_TEMPLATE.format(page)
        response = requests.get(
            url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT
        )
        html = response.text
        content_list = self.DESC_PATTERN.findall(html)
        self.dealpage(content_list)

    def dealpage(self, content_list):
        """Write every extracted snippet in ``content_list`` to the output file.

        Args:
            content_list: Iterable of text snippets as returned by
                ``DESC_PATTERN.findall``.
        """
        for item in content_list:
            self.write(item)

    def write(self, item):
        """Append one snippet to ``OUTPUT_FILE`` as UTF-8 text.

        Args:
            item: The text to append. No separator is added.
        """
        # Encode at the file boundary. The old ``str(item.encode('utf-8'))``
        # wrote the bytes repr (``b'\\xe4...'``) instead of the actual text.
        with open(self.OUTPUT_FILE, "a", encoding="utf-8") as f:
            f.write(item)

    def start(self):
        """Run the spider once for the page chosen at construction time."""
        # NOTE: an interactive "continue / quit" loop over further pages was
        # sketched here in commented-out code but never implemented.
        self.loadpage(self.page)


if __name__ == "__main__":
    duanzi = Spider()
    duanzi.start()