两个爬虫记录

第一个

from urllib import request
import re
import datetime
# import gzip
# from io import BytesIO


class Spider():
    """Scrape the SHFE (Shanghai Futures Exchange) notice page and print
    each notice's title and publication date, newest first.
    """

    # Listing page for exchange notices.
    url = 'http://www.shfe.com.cn/news/notice/'
    # Each notice is rendered as one <li>...</li> item.
    # root_pattern = r'<div class="p4 lawbox">([\s\S]*?)</div>'
    root_pattern = r'<li>([\s\S]*?)</li>'
    # Publication date lives in a <span> inside the item.
    time_patten = r'<span>([\s\S]*?)</span>'
    # Title is the anchor text: ...">TITLE</a>.
    title_pattern = r'">([\w\W]*?)</a>'

    def __fetch_content(self):
        """Download the notice page and return it as a UTF-8 string."""
        r = request.urlopen(Spider.url)
        htmls = r.read()
        htmls = str(htmls, encoding='utf-8')
        return htmls

    def __analysis(self, htmls):
        """Extract raw match lists from the page.

        Returns a list of dicts: {'time': [...], 'title': [...]}, one per
        <li> item, each value being the regex findall result (may be empty
        for malformed items).
        """
        root_html = re.findall(Spider.root_pattern, htmls)
        anchors = []
        for html in root_html:
            title = re.findall(Spider.title_pattern, html)
            time = re.findall(Spider.time_patten, html)
            anchor = {'time': time, 'title': title}
            anchors.append(anchor)
        return anchors

    def __refine(self, anchors):
        """Flatten each one-element match list and strip whitespace from
        the date. Items with no title or date match are skipped instead of
        raising IndexError.
        """
        l = lambda anchor: {
            'time': anchor['time'][0].strip(),
            'title': anchor['title'][0]
        }
        return map(l, (a for a in anchors if a['time'] and a['title']))

    def __sort(self, anchors):
        """Sort notices newest-first; ISO-style date strings (YYYY-MM-DD)
        compare correctly as plain strings, reverse=True gives descending
        order.
        """
        anchors = sorted(anchors, key=self.__sort_seed, reverse=True)
        return anchors

    def __sort_seed(self, anchor):
        # Sort key: the already-stripped date string.
        return anchor['time']

    def __show(self, anchors):
        """Print one 'title------date' line per notice."""
        for anchor in anchors:
            print(anchor['title'] + '------' + anchor['time'])

    def go(self):
        """Entry point: fetch, parse, refine, sort and display."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)


if __name__ == '__main__':
    spider = Spider()
    spider.go()

第二个

from urllib import request
import re
import datetime

# import gzip
# from io import BytesIO


class Spider():
    """Scrape the CFFEX (China Financial Futures Exchange) announcement
    page and print each announcement's title and date.
    """

    # Listing page for exchange announcements.
    url = 'http://www.cffex.com.cn/jysgg/'
    # root_pattern_cffex = r'<ul class="clearFloat">([\s\S]*?)</ul>'
    # Greedy: one match spanning from the first date marker to the last
    # </a>, i.e. (nearly) the whole announcement list in a single capture.
    root_pattern_cffex = r'class="time comparetime">([\w\W]*)</a>'
    time_pattern_cffex = r'class="time comparetime">([\w\W]*?)</a>'
    title_pattern_cffex = r'title="([\w\W]*?)" >'

    def __fetch_content_cffex(self):
        """Download the announcement page and return it as a UTF-8 string."""
        r = request.urlopen(Spider.url)
        htmls = r.read()
        htmls = str(htmls, encoding='utf-8')
        return htmls

    def __analysis_cffex(self, htmls):
        """Parse the page, print title/date pairs, and return the raw
        anchor dicts: [{'title': [...], 'time': [...]}].

        Returns an empty list (and prints nothing) when nothing matched,
        instead of raising NameError on the loop variable.
        """
        root_ceffex = re.findall(Spider.root_pattern_cffex, htmls)
        anchors = []
        for html in root_ceffex:
            time = re.findall(Spider.time_pattern_cffex, html)
            title = re.findall(Spider.title_pattern_cffex, html)
            anchors.append({'title': title, 'time': time})
        if not anchors:
            # Page layout changed or empty response: nothing to show.
            return anchors
        # The greedy root pattern yields a single match, so the last anchor
        # holds every title/date found on the page.
        tit = anchors[-1].get('title')
        tim = anchors[-1].get('time')
        # zip stops at the shorter list, avoiding IndexError when the
        # title and date counts differ.
        for t, d in zip(tit, tim):
            print(t, d, sep='\t')
        return anchors

    def __refine(self, anchors):
        # TODO: data refinement not implemented yet.
        pass

    def go(self):
        """Entry point: fetch, parse/print, then (no-op) refine."""
        htmls = self.__fetch_content_cffex()
        anchors = self.__analysis_cffex(htmls)
        self.__refine(anchors)


if __name__ == '__main__':
    # The original rebound the class name (`Spider = Spider()`), shadowing
    # the class itself; use a lowercase instance name instead.
    spider = Spider()
    spider.go()

po完了 凑合看:)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值