# coding:utf-8
import urllib2
import re
import time
class Spider(object):
    """Scrape joke text from neihan8.com article index pages into a file.

    Targets Python 2: it relies on the module-level ``urllib2`` import and
    on the ``raw_input`` builtin.
    """

    # Compiled once instead of on every page.  re.S lets a description span
    # several lines; the raw string keeps "\s" a regex escape rather than a
    # (deprecated) string escape.
    _DESC_PATTERN = re.compile(r'<div\sclass="desc">(.*?)</div>', re.S)

    # Browser-like User-Agent sent with every request.
    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
    }

    # Page 1 has no numeric suffix; every other page is index_<n>.html.
    _FIRST_PAGE_URL = "http://www.neihan8.com/article/index.html"
    _PAGE_URL_TEMPLATE = "http://www.neihan8.com/article/index_%s.html"

    # File that extracted text is appended to (relative to the working dir).
    _OUTPUT_FILE = "duanzi.txt"

    def __init__(self):
        """Create a spider; it keeps no per-instance state."""

    def _pageUrl(self, num):
        """Return the index-page URL for page number ``num``."""
        if num == 1:
            return self._FIRST_PAGE_URL
        return self._PAGE_URL_TEMPLATE % num

    def _fetch(self, url):
        """Download ``url`` and return the raw response body."""
        request = urllib2.Request(url, headers=self._HEADERS)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even if reading fails part-way.
            response.close()

    def loadPage(self):
        """Prompt for a page range, then fetch and save every page in it.

        Reads the first and last page numbers (both inclusive) from standard
        input.  For each page the URL is printed, the page is downloaded, the
        contents of its ``<div class="desc">`` elements are extracted and
        handed to :meth:`writePage`.  Nothing is fetched when the start page
        is greater than the end page.

        Raises:
            ValueError: if either answer cannot be parsed as an integer.
        """
        startNum = int(raw_input("请输入起始页号:"))
        endNum = int(raw_input("请输入结束页号:"))
        for num in range(startNum, endNum + 1):
            url = self._pageUrl(num)
            print(url)
            html = self._fetch(url)
            self.writePage(self._DESC_PATTERN.findall(html))

    def writePage(self, content_list):
        """Append each string in ``content_list`` to the output file.

        Every entry is followed by two CRLF pairs, i.e. one blank line
        between entries.  The file is opened (and created if missing) even
        when ``content_list`` is empty.

        Args:
            content_list: iterable of strings to append.
        """
        with open(self._OUTPUT_FILE, "a") as f:
            f.writelines(content + "\r\n\r\n" for content in content_list)
# Run the interactive scrape only when executed as a script, not on import.
if __name__ == "__main__":
    Spider().loadPage()
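# neihan8段子爬取 (neihan8 joke scraper)
# 最新推荐文章于 2021-06-07 18:17:12 发布 (blog page residue: "latest recommended article published 2021-06-07 18:17:12")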