# Python scraping practice: crawl inspirational quotes and save them locally (regex)
import urllib.parse
import urllib.request
import re
def handle_request(url, page=None):
    """Build a urllib Request for the target URL.

    When *page* is given, the listing-page URL is completed by appending
    '<page>.html'; otherwise *url* is used verbatim (detail pages already
    carry a full URL).

    Args:
        url: Base URL, or a listing-URL prefix ending just before the page number.
        page: Optional page number to append as '<page>.html'.

    Returns:
        A urllib.request.Request carrying a browser User-Agent header
        (some sites reject the default Python UA).
    """
    # Idiomatic identity check (PEP 8): compare to None with `is not`.
    if page is not None:
        url = url + str(page) + '.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    return urllib.request.Request(url=url, headers=headers)
def get_text(a_href):
    """Download one quote detail page and return its body text.

    Extracts the contents of the <div class="neirong"> block and strips any
    embedded <img> tags (only the text is worth saving).

    Args:
        a_href: Absolute URL of the detail page.

    Returns:
        The cleaned HTML text, or '' when the expected content div is not
        found (previously this raised an uncaught IndexError on layout
        changes or empty pages).
    """
    request = handle_request(a_href)
    html = urllib.request.urlopen(request).read().decode()
    # re.S so '.' also matches newlines inside the div.
    matches = re.findall(r'<div class="neirong">(.*?)</div>', html, re.S)
    if not matches:
        # Page layout changed or the div is missing — degrade gracefully.
        return ''
    # Drop inline images; keep everything else as-is.
    return re.sub(r'<img .*?>', '', matches[0], flags=re.S)
def get_content(lt):
    """Fetch the full text for each quote link and append it to lizhi1.html.

    Args:
        lt: List of (href, title) tuples as produced by parse_content();
            href is site-relative and is prefixed with the site root here.
    """
    # Open the output file once, not once per quote as before — same
    # append semantics, far fewer open/close cycles.
    with open('lizhi1.html', 'a', encoding='utf8') as fp:
        for a_href, title in lt:
            url = 'http://www.yikexun.cn' + a_href
            text = get_text(url)
            fp.write('<h1>%s</h1>%s' % (title, text))
def parse_content(content):
    """Pull every (href, title) pair off a listing page and hand the
    list to get_content() for downloading and saving."""
    link_pattern = re.compile(r'<h3><a href="(.*?)"><b>(.*?)</b></a></h3>')
    pairs = link_pattern.findall(content)
    get_content(pairs)
def main():
    """Prompt for a page range, then fetch and parse each listing page."""
    base_url = 'http://www.yikexun.cn/lizhi/qianming/list_50_'
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    # Inclusive range: scrape start_page .. end_page.
    for page in range(start_page, end_page + 1):
        req = handle_request(base_url, page)
        html = urllib.request.urlopen(req).read().decode()
        parse_content(html)


if __name__ == '__main__':
    main()