两个爬虫记录

第一个

from urllib import request
import re
import datetime
# import gzip
# from io import BytesIO


class Spider():
    """Scrape the SHFE (Shanghai Futures Exchange) notice page and print
    each notice's title and publication date, newest first.
    """

    # Listing page for exchange notices.
    url = 'http://www.shfe.com.cn/news/notice/'
    # Each notice is rendered as one <li>...</li> item.
    # root_pattern = r'<div class="p4 lawbox">([\s\S]*?)</div>'
    root_pattern = r'<li>([\s\S]*?)</li>'
    # Publication date lives in a <span> inside the item.
    time_patten = r'<span>([\s\S]*?)</span>'
    # Title is the anchor text: ...">TITLE</a>.
    title_pattern = r'">([\w\W]*?)</a>'

    def __fetch_content(self):
        """Download the notice page and return it as a UTF-8 string."""
        r = request.urlopen(Spider.url)
        htmls = r.read()
        htmls = str(htmls, encoding='utf-8')
        return htmls

    def __analysis(self, htmls):
        """Extract raw match lists from the page.

        Returns a list of dicts: {'time': [...], 'title': [...]}, one per
        <li> item, each value being the regex findall result (may be empty
        for malformed items).
        """
        root_html = re.findall(Spider.root_pattern, htmls)
        anchors = []
        for html in root_html:
            title = re.findall(Spider.title_pattern, html)
            time = re.findall(Spider.time_patten, html)
            anchor = {'time': time, 'title': title}
            anchors.append(anchor)
        return anchors

    def __refine(self, anchors):
        """Flatten each one-element match list and strip whitespace from
        the date. Items with no title or date match are skipped instead of
        raising IndexError.
        """
        l = lambda anchor: {
            'time': anchor['time'][0].strip(),
            'title': anchor['title'][0]
        }
        return map(l, (a for a in anchors if a['time'] and a['title']))

    def __sort(self, anchors):
        """Sort notices newest-first; ISO-style date strings (YYYY-MM-DD)
        compare correctly as plain strings, reverse=True gives descending
        order.
        """
        anchors = sorted(anchors, key=self.__sort_seed, reverse=True)
        return anchors

    def __sort_seed(self, anchor):
        # Sort key: the already-stripped date string.
        return anchor['time']

    def __show(self, anchors):
        """Print one 'title------date' line per notice."""
        for anchor in anchors:
            print(anchor['title'] + '------' + anchor['time'])

    def go(self):
        """Entry point: fetch, parse, refine, sort and display."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)


if __name__ == '__main__':
    spider = Spider()
    spider.go()

第二个

from urllib import request
import re
import datetime

# import gzip
# from io import BytesIO


class Spider():
    """Scrape the CFFEX (China Financial Futures Exchange) announcement
    page and print each announcement's title and date.
    """

    # Listing page for exchange announcements.
    url = 'http://www.cffex.com.cn/jysgg/'
    # root_pattern_cffex = r'<ul class="clearFloat">([\s\S]*?)</ul>'
    # Greedy: one match spanning from the first date marker to the last
    # </a>, i.e. (nearly) the whole announcement list in a single capture.
    root_pattern_cffex = r'class="time comparetime">([\w\W]*)</a>'
    time_pattern_cffex = r'class="time comparetime">([\w\W]*?)</a>'
    title_pattern_cffex = r'title="([\w\W]*?)" >'

    def __fetch_content_cffex(self):
        """Download the announcement page and return it as a UTF-8 string."""
        r = request.urlopen(Spider.url)
        htmls = r.read()
        htmls = str(htmls, encoding='utf-8')
        return htmls

    def __analysis_cffex(self, htmls):
        """Parse the page, print title/date pairs, and return the raw
        anchor dicts: [{'title': [...], 'time': [...]}].

        Returns an empty list (and prints nothing) when nothing matched,
        instead of raising NameError on the loop variable.
        """
        root_ceffex = re.findall(Spider.root_pattern_cffex, htmls)
        anchors = []
        for html in root_ceffex:
            time = re.findall(Spider.time_pattern_cffex, html)
            title = re.findall(Spider.title_pattern_cffex, html)
            anchors.append({'title': title, 'time': time})
        if not anchors:
            # Page layout changed or empty response: nothing to show.
            return anchors
        # The greedy root pattern yields a single match, so the last anchor
        # holds every title/date found on the page.
        tit = anchors[-1].get('title')
        tim = anchors[-1].get('time')
        # zip stops at the shorter list, avoiding IndexError when the
        # title and date counts differ.
        for t, d in zip(tit, tim):
            print(t, d, sep='\t')
        return anchors

    def __refine(self, anchors):
        # TODO: data refinement not implemented yet.
        pass

    def go(self):
        """Entry point: fetch, parse/print, then (no-op) refine."""
        htmls = self.__fetch_content_cffex()
        anchors = self.__analysis_cffex(htmls)
        self.__refine(anchors)


if __name__ == '__main__':
    # The original rebound the class name (`Spider = Spider()`), shadowing
    # the class itself; use a lowercase instance name instead.
    spider = Spider()
    spider.go()

po完了 凑合看:)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值