# Python scraping practice: crawl inspirational quotes and save them locally (regex)
import urllib.parse
import urllib.request
import re
def handle_request(url, page=None):
    """Build a urllib Request for the target URL.

    When *page* is given, the listing-page URL is completed by appending
    '<page>.html'; otherwise *url* is used verbatim (detail pages already
    carry a full URL).

    Args:
        url: Base URL, or a listing-URL prefix ending just before the page number.
        page: Optional page number to append as '<page>.html'.

    Returns:
        A urllib.request.Request carrying a browser User-Agent header
        (some sites reject the default Python UA).
    """
    # Idiomatic identity check (PEP 8): compare to None with `is not`.
    if page is not None:
        url = url + str(page) + '.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    return urllib.request.Request(url=url, headers=headers)
def get_text(a_href):
    """Download one quote detail page and return its body text.

    Extracts the contents of the <div class="neirong"> block and strips any
    embedded <img> tags (only the text is worth saving).

    Args:
        a_href: Absolute URL of the detail page.

    Returns:
        The cleaned HTML text, or '' when the expected content div is not
        found (previously this raised an uncaught IndexError on layout
        changes or empty pages).
    """
    request = handle_request(a_href)
    html = urllib.request.urlopen(request).read().decode()
    # re.S so '.' also matches newlines inside the div.
    matches = re.findall(r'<div class="neirong">(.*?)</div>', html, re.S)
    if not matches:
        # Page layout changed or the div is missing — degrade gracefully.
        return ''
    # Drop inline images; keep everything else as-is.
    return re.sub(r'<img .*?>', '', matches[0], flags=re.S)
def get_content(lt):
    """Fetch the full text for each quote link and append it to lizhi1.html.

    Args:
        lt: List of (href, title) tuples as produced by parse_content();
            href is site-relative and is prefixed with the site root here.
    """
    # Open the output file once, not once per quote as before — same
    # append semantics, far fewer open/close cycles.
    with open('lizhi1.html', 'a', encoding='utf8') as fp:
        for a_href, title in lt:
            url = 'http://www.yikexun.cn' + a_href
            text = get_text(url)
            fp.write('<h1>%s</h1>%s' % (title, text))
def parse_content(content):
    """Pull every (href, title) pair off a listing page and hand the
    list to get_content() for downloading and saving."""
    link_pattern = re.compile(r'<h3><a href="(.*?)"><b>(.*?)</b></a></h3>')
    pairs = link_pattern.findall(content)
    get_content(pairs)
def main():
    """Prompt for a page range, then fetch and parse each listing page."""
    base_url = 'http://www.yikexun.cn/lizhi/qianming/list_50_'
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    # Inclusive range: scrape start_page .. end_page.
    for page in range(start_page, end_page + 1):
        req = handle_request(base_url, page)
        html = urllib.request.urlopen(req).read().decode()
        parse_content(html)


if __name__ == '__main__':
    main()