import urllib.request
import re
def getSentence(data):
    """Return the inner text of every ``<span>...</span>`` pair found in *data*.

    Args:
        data: HTML markup as a ``str``.

    Returns:
        A list with the text captured between each literal ``<span>`` opening
        tag and the nearest following ``</span>``, in order of appearance.
        The list is empty when nothing matches.
    """
    # Non-greedy capture so each match stops at the nearest closing tag.
    # Only the bare "<span>" tag is matched (no attributes), and "." does not
    # match newlines, so the opening and closing tags must sit on one line.
    pattern = r'<span>(.*?)</span>'
    return re.findall(pattern, data)
# Script body: fetch the page at `url`, extract every <span>...</span> text
# with getSentence(), and write the entries that do not start with "<img"
# to qsbk.txt, each followed by a blank line.
url = "http://www.qiushibaike.com/"
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
# NOTE(review): the hard-coded If-None-Match value makes this a conditional
# request; if the server replies "304 Not Modified", urlopen() raises
# HTTPError and nothing is written. Confirm the header is really wanted.
headers = {
    'User-Agent': user_agent,
    'If-None-Match': "6e0f605c36f86beeee986d350bf78be89d606ee5",
}

try:
    req = urllib.request.Request(url, headers=headers)
    # Context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(req) as response:
        result = response.read()
except urllib.request.URLError as e:
    # HTTPError (a URLError subclass) carries a status `code`; report
    # whichever details the exception provides.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
else:
    # bytes.decode() defaults to UTF-8.
    strlist = getSentence(result.decode())
    # Explicit UTF-8 so writing non-ASCII text does not depend on the
    # platform's default encoding; `with` guarantees the file is closed.
    with open("qsbk.txt", "w", encoding="utf-8") as out_file:
        for sentence in strlist:
            # Skip captured spans whose content begins with an <img> tag.
            if not sentence.startswith('<img'):
                out_file.write(sentence + "\n\n")
# Written with reference to 希望姐's notes, with some changes of my own:
# http://cuiqingcai.com/990.html