爬虫练习一

最新推荐文章于 2024-09-24 16:30:29 发布

划过天空

最新推荐文章于 2024-09-24 16:30:29 发布

阅读量326

点赞数

分类专栏：Python学习　文章标签：python、爬虫、个人自学

本文链接：https://blog.csdn.net/qq_32895695/article/details/49894605

版权

Python学习专栏收录该内容

11 篇文章 0 订阅

订阅专栏

# -*- coding:utf-8 -*-
import io
import re
import string
import thread
import urllib
import urllib2

class qqZone:
    """Download one web page and extract a title, a text snippet and a page count.

    Attributes:
        url:   address passed to the constructor.
        page:  decoded text of the downloaded page ('' until getMain runs).
        title: text of the first <a> element with file-name-hostile
               characters removed ('' until getNeed runs).
        infos: raw text of the first <a> element.  Starts out as an empty
               list and is replaced by a string in getNeed.
    """

    # Matches any <a ...>text</a> element; DOTALL so the text may span lines.
    _ANCHOR = re.compile(r'<a.*?>(.*?)</a>', re.S)

    # Characters that must not end up in the file name built from the title:
    # the ones the original code stripped, plus line breaks and tabs.
    _FILENAME_JUNK = re.compile(r'[\\/:*?"<>|\r\n\t]')

    # "共N页" ("N pages in total"), e.g. "301条数据 共7页".  This must be a
    # unicode literal: the page text is decoded to unicode in getPage, and a
    # byte-string pattern holding UTF-8 encoded Chinese never matches unicode
    # text on Python 2 -- which is why the count used to come back as 0.
    _PAGE_COUNT = re.compile(u'共(\\d+)页')

    def __init__(self, myurl):
        """Remember the address; nothing is downloaded yet."""
        self.url = myurl
        self.infos = []   # data selected from the page by regex
        self.title = ''
        self.page = ''    # everything downloaded from the url

    def getMain(self):
        """Download self.url, extract title/infos and print a short report."""
        self.page = self.getPage(self.url)
        self.getNeed(self.page)
        myCount = self.getPageCount(self.page)
        print(u"myTitle is:" + self.title)
        print(u"myInfos is:" + self.infos)
        # The count is an int, so it is formatted with %d; concatenating it
        # to a str the way the lines above do would raise.
        print("count is %d" % myCount)

    def saveInfo(self):
        """Write self.infos to '<title>.txt' in the current directory.

        The file is written as UTF-8.  The built-in open() would try to
        encode unicode text as ASCII on Python 2 and fail on any non-ASCII
        character.  self.infos must be text (it is after getNeed has run).
        """
        filename = self.title + '.txt'
        with io.open(filename, 'w', encoding='utf-8') as out:
            out.write(self.infos)

    # Author's notes:
    # 1. Once the User-Agent header was added, the title could be fetched.
    #    Content inside QQ Zone itself still cannot be crawled.
    # 3. Decoding the response as UTF-8 fixed the garbled-text problem.
    def getPage(self, myurl):
        """Return the body of *myurl* decoded as UTF-8.

        Raises whatever urllib2 raises when the request fails, and
        UnicodeDecodeError when the body is not valid UTF-8.
        """
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        req = urllib2.Request(myurl, headers=headers)
        res = urllib2.urlopen(req)
        try:
            return res.read().decode('utf-8')
        finally:
            # Release the connection even when read/decode fails.
            res.close()

    # Sample markup the patterns were tried against:
    #   <h1 class="head-title" id="top_head_title">...
    #   <a href="http://blog.csdn.net/pleasecallmewhy">...</a>
    #   <a href="/pleasecallmewhy/article/details/24419023">
    #   [NodeJS]...
    #   </a>
    def getNeed(self, infos):
        """Set self.title and self.infos from the first <a> element in *infos*.

        self.infos receives the element's text unchanged.  self.title
        receives the same text with characters that are unsafe in a file
        name removed and surrounding whitespace stripped, because saveInfo
        uses it as a file name.  When there is no <a> element, or the
        cleaned title is empty, placeholder strings are stored instead.
        """
        # NOTE(review): title and infos have always come from the same
        # pattern, so they describe the same element -- confirm intended.
        match = self._ANCHOR.search(infos)
        if match is None:
            self.title = u"暂无标题"
            self.infos = u"暂无内容"
            return

        text = match.group(1)
        cleaned = self._FILENAME_JUNK.sub('', text).strip()
        self.title = cleaned or u"暂无标题"
        self.infos = text

    def getAllInfos(self, infos):
        """Return the text of every <a> element in *infos*, newlines removed."""
        # findall with a single group yields plain strings, so each item is
        # the whole link text (indexing it would keep only one character).
        return [text.replace("\n", "") for text in self._ANCHOR.findall(infos)]

    def getPageCount(self, infos):
        """Return N from the first "共N页" marker in *infos*, or 0 if absent."""
        match = self._PAGE_COUNT.search(infos)
        return int(match.group(1)) if match else 0

# Script entry: ask for a page address, then fetch it, print the report
# and save the extracted text to disk.
print(u"请输入地址：")
myurl = raw_input(' ')  # raw_input already returns a str
zone = qqZone(myurl)
zone.getMain()
zone.saveInfo()