# -*- coding: UTF-8 -*-
"""Download every chapter listed on a novel's index page into ./novel/.

The script fetches the index page at BOOK_URL through an HTTP(S) proxy,
collects the chapter links found on it, downloads each chapter page,
extracts the text inside ``<div id="content">`` and writes it as UTF-8 to
``./novel/<chapter title>.txt``.

The code runs on both Python 2 (``urllib2``) and Python 3
(``urllib.request``).
"""
import io
import os
import re
import ssl
import sys

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 exposes the same opener/handler API under urllib.request.
    import urllib.request as urllib2

# Index page of the book; chapter hrefs on it are relative to this URL.
BOOK_URL = "https://www.i7wx.com/book/0/636/"
# Directory the chapter files are written to.
OUTPUT_DIR = "./novel/"
# Character encoding used to decode the fetched pages.
PAGE_ENCODING = 'gbk'
# Proxy settings ('xxx' are placeholders to be filled in by the user).
PROXY = {
    'http': 'xxx',
    'https': 'xxx'
}

# Compiled once at import time instead of once per chapter.
# The dot before "html" is escaped so it only matches a literal '.'.
CHAPTER_LINK_RE = re.compile(r'<a href="(\d*\.html)">(.*?)</a>', re.I)
# re.S lets the chapter body span several lines of HTML.
CONTENT_RE = re.compile(r'<div id="content">(.*?)</div>', re.S)
PARAGRAPH_BREAK_RE = re.compile(r'(<br/><br/>)')
# NOTE(review): this removes plain spaces; the pattern may originally have
# been an HTML entity such as "&nbsp;" that was lost in transcription --
# confirm against the real page markup.
SPACE_RE = re.compile(r' ')
# Characters that are path separators or rejected by common filesystems.
UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|]')


def make_opener(proxy):
    """Return a urllib opener that routes requests through *proxy*.

    SECURITY: the opener uses ``ssl._create_unverified_context()``, i.e.
    TLS certificates are NOT verified. This is kept from the original
    script; switch to ``ssl.create_default_context()`` if the proxy and
    site present valid certificates.
    """
    ssl_context = ssl._create_unverified_context()
    https_handler = urllib2.HTTPSHandler(context=ssl_context)
    proxy_handler = urllib2.ProxyHandler(proxy)
    return urllib2.build_opener(proxy_handler, https_handler)


def fetch_page(opener, url):
    """Fetch *url* with *opener* and return the body decoded as text.

    The response is always closed, even if reading or decoding fails.
    """
    response = opener.open(url)
    try:
        return response.read().decode(PAGE_ENCODING)
    finally:
        response.close()


def parse_chapter_links(index_html):
    """Return a list of ``(href, title)`` pairs found in *index_html*.

    Only anchors whose href is ``<digits>.html`` are returned.
    """
    return CHAPTER_LINK_RE.findall(index_html)


def extract_chapter_text(chapter_html):
    """Return the plain text of a chapter page, or None if none is found.

    The text is the inside of the first ``<div id="content">`` element,
    with each ``<br/><br/>`` turned into a newline and spaces removed.
    """
    match = CONTENT_RE.search(chapter_html)
    if match is None:
        return None
    text = PARAGRAPH_BREAK_RE.sub('\n', match.group(1))
    return SPACE_RE.sub('', text)


def safe_filename(title):
    """Return *title* with filesystem-hostile characters replaced by '_'."""
    return UNSAFE_FILENAME_RE.sub('_', title)


def main():
    """Download all chapters of the book at BOOK_URL into OUTPUT_DIR."""
    opener = make_opener(PROXY)
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for href, title in parse_chapter_links(fetch_page(opener, BOOK_URL)):
        # Fetch and parse BEFORE opening the output file so that a failed
        # download or an unparsable page does not leave an empty file.
        text = extract_chapter_text(fetch_page(opener, BOOK_URL + href))
        if text is None:
            sys.stderr.write("no content found in %s, skipped\n" % href)
            continue
        print(text)
        path = os.path.join(OUTPUT_DIR, safe_filename(title) + ".txt")
        # io.open encodes on write; the with-block closes the file.
        with io.open(path, 'w', encoding='utf-8') as f:
            f.write(text)


if __name__ == "__main__":
    main()
urllib2爬取小说三寸人间
最新推荐文章于 2022-07-29 15:06:37 发布