# -*- coding:utf-8 -*-
import urllib2,urllib,re,string,thread
class qqZone:
    """Small scraper: download a page, regex-extract a title, the first
    'link_title' anchor text and a page count, then save the extract to
    '<title>.txt'. (Python 2: uses urllib2 and byte-oriented file I/O.)"""

    def __init__(self, myurl):
        self.url = myurl
        self.infos = u''   # text selected by regex (filled in by getNeed)
        self.title = u''   # page title, sanitized for use as a filename
        self.page = u''    # full decoded page body (filled in by getMain)

    def getMain(self):
        """Fetch self.url, populate title/infos, and print the results."""
        self.page = self.getPage(self.url)
        self.getNeed(self.page)
        myCount = self.getPageCount(self.page)
        print(u"myTitle is:" + self.title)
        print(u"myInfos is:" + self.infos)
        # myCount is an int, so %d formatting — concatenation would raise.
        print("count is %d" % myCount)

    def saveInfo(self):
        """Write the extracted text to '<title>.txt' as UTF-8 bytes."""
        filename = self.title + '.txt'
        # 'with' guarantees the handle is closed even on error; encode
        # explicitly so non-ASCII text does not raise UnicodeEncodeError.
        with open(filename, 'w+') as out:
            out.write(self.infos.encode('utf-8'))

    def getPage(self, myurl):
        """Download myurl and return the body decoded as UTF-8.

        A browser User-Agent header is sent because the default urllib2
        agent got an unusable response (per the original author's note);
        decoding here, once, avoids mojibake downstream.
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(myurl, headers=headers)
        res = urllib2.urlopen(req)
        return res.read().decode('utf-8')

    def getNeed(self, infos):
        """Extract the first anchor text as the title and the first
        '<span class="link_title">…' anchor text as the infos payload.

        Falls back to placeholder strings when a pattern does not match.
        """
        titleMatch = re.search(r'<a.*?>(.*?)</a>', infos, re.S)
        infosMatch = re.search(r'<span class="link_title"><a.*?>(.*?)</a></span>', infos, re.S)
        if titleMatch:
            self.title = titleMatch.group(1)
        else:
            self.title = u"暂无标题"
        # One pass instead of nine chained .replace calls: strip every
        # character that is illegal in a filename (title names the file).
        self.title = re.sub(r'[\\/:*?"<>|]', '', self.title)
        if infosMatch:
            self.infos = infosMatch.group(1)
        else:
            self.infos = u"暂无内容"

    def getAllInfos(self, infos):
        """Return every 'link_title' anchor text with newlines removed."""
        myItems = re.findall(r'<span class="link_title"><a.*?>(.*?)</a></span>', infos, re.S)
        # findall with a single group yields plain strings; the original
        # item[0] kept only the first character of each title.
        return [item.replace("\n", "") for item in myItems]

    def getPageCount(self, infos):
        """Parse the total page count from '<span>…共N页</span>'; 0 if absent."""
        countMatch = re.search(r'<span>.*?共(\d+?)页</span>', infos, re.S)
        if countMatch:
            return int(countMatch.group(1))
        return 0
if __name__ == '__main__':
    # Interactive entry point: ask for a URL, scrape it, save the result.
    # Guarded so importing this module does not trigger network/file I/O.
    print(u"请输入地址:")
    # raw_input already returns a str in Python 2 — wrapping in str() was redundant.
    myurl = raw_input(' ')
    zone = qqZone(myurl)
    zone.getMain()
    zone.saveInfo()
import urllib2,urllib,re,string,thread
class qqZone:
    """Page scraper (duplicate definition kept in place): downloads a URL,
    pulls a title, the first 'link_title' anchor text and a page count via
    regex, and can save the extract to '<title>.txt'. Python 2 / urllib2."""

    def __init__(self, myurl):
        self.url = myurl
        self.infos = u''   # regex-selected text, set by getNeed
        self.title = u''   # sanitized title, doubles as output filename stem
        self.page = u''    # whole decoded page, set by getMain

    def getMain(self):
        """Fetch self.url, run the extraction, and print what was found."""
        self.page = self.getPage(self.url)
        self.getNeed(self.page)
        myCount = self.getPageCount(self.page)
        print(u"myTitle is:" + self.title)
        print(u"myInfos is:" + self.infos)
        # int value -> %d formatting rather than string concatenation.
        print("count is %d" % myCount)

    def saveInfo(self):
        """Persist the extracted text to '<title>.txt' as UTF-8 bytes."""
        filename = self.title + '.txt'
        # Context manager closes the file on all paths; explicit encode
        # prevents UnicodeEncodeError on non-ASCII content in Python 2.
        with open(filename, 'w+') as out:
            out.write(self.infos.encode('utf-8'))

    def getPage(self, myurl):
        """Return the UTF-8-decoded body of myurl.

        Sends a browser User-Agent (the stock urllib2 agent yielded an
        unusable page per the original note); decoding once here keeps
        all later processing in unicode.
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(myurl, headers=headers)
        res = urllib2.urlopen(req)
        return res.read().decode('utf-8')

    def getNeed(self, infos):
        """Set self.title from the first anchor and self.infos from the
        first '<span class="link_title">…' anchor; placeholders on miss."""
        titleMatch = re.search(r'<a.*?>(.*?)</a>', infos, re.S)
        infosMatch = re.search(r'<span class="link_title"><a.*?>(.*?)</a></span>', infos, re.S)
        if titleMatch:
            self.title = titleMatch.group(1)
        else:
            self.title = u"暂无标题"
        # Single substitution replaces the nine chained .replace calls:
        # drop every filename-illegal character from the title.
        self.title = re.sub(r'[\\/:*?"<>|]', '', self.title)
        if infosMatch:
            self.infos = infosMatch.group(1)
        else:
            self.infos = u"暂无内容"

    def getAllInfos(self, infos):
        """List every 'link_title' anchor text, newlines stripped."""
        myItems = re.findall(r'<span class="link_title"><a.*?>(.*?)</a></span>', infos, re.S)
        # Single-group findall returns strings — item[0] (the original
        # code) truncated each title to its first character.
        return [item.replace("\n", "") for item in myItems]

    def getPageCount(self, infos):
        """Page count parsed from '<span>…共N页</span>', or 0 when missing."""
        countMatch = re.search(r'<span>.*?共(\d+?)页</span>', infos, re.S)
        if countMatch:
            return int(countMatch.group(1))
        return 0
if __name__ == '__main__':
    # Script entry point: prompt for a URL, scrape it, write the extract.
    # The guard keeps imports of this module free of side effects.
    print(u"请输入地址:")
    # raw_input already yields a str in Python 2; str() around it was a no-op.
    myurl = raw_input(' ')
    zone = qqZone(myurl)
    zone.getMain()
    zone.saveInfo()