import urllib.request
import re
import gzip
from bs4 import BeautifulSoup


def savefile(data, idx):
    """Write one page of scraped entries to a GBK-encoded .txt file."""
    path = 'd:\\u\\o_' + str(idx + 1) + '.txt'
    with open(path, 'wb') as file:
        page = 'Current page: ' + str(idx + 1) + '\n'
        file.write(page.encode('gbk'))
        for d in data:
            file.write((str(d) + '\n').encode('gbk'))


def ungzip(data):
    """Decompress a gzip response body; return the data unchanged if it is not gzipped."""
    try:
        data = gzip.decompress(data)
    except OSError:
        print('Response body was not gzip-compressed; using it as-is')
    return data
class csdnspider:
    def __init__(self, pageidx=1, url="http://blog.csdn.net/fly_yr/article/list/1"):
        self.pageidx = pageidx
        # Keep only the base list URL (up to and including the last '/');
        # setpage() appends the page number.
        self.url = url[0:url.rfind('/') + 1]
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Host": "blog.csdn.net"
        }

    def getpages(self):
        """Fetch the article list and return the total number of pages as a string."""
        req = urllib.request.Request(url=self.url, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read()).decode('utf-8')
        soup = BeautifulSoup(data, 'html5lib')
        tag = soup.find('div', "pagelist")
        pagedata = tag.span.get_text()
        # The pager text reads "共N页" ("N pages in total"); extract N.
        pagesNum = re.findall(r'共(.*?)页', pagedata)[0]
        return pagesNum

    def setpage(self, idx):
        """Point self.url at list page number idx (page numbers are 1-based)."""
        self.url = self.url[0:self.url.rfind('/') + 1] + str(idx)

    def readData(self):
        """Parse the current list page and return one formatted entry per article."""
        ret = []
        req = urllib.request.Request(url=self.url, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read()).decode('utf-8')
        soup = BeautifulSoup(data, 'html5lib')
        items = soup.find_all('div', "list_item article_item")
        for item in items:
            title = item.find('span', "link_title").a.get_text()
            link = item.find('span', "link_title").a.get('href')
            writetime = item.find('span', "link_postdate").get_text()
            readers = item.find('span', "link_view").get_text()
            comments = item.find('span', "link_comments").get_text()
            ret.append('Date: ' + writetime + '\nTitle: ' + title + '\nLink: ' + link +
                       '\nViews: ' + readers + '\tComments: ' + comments + '\n')
        return ret
cs = csdnspider()
pagesNum = int(cs.getpages())
print('Total pages:', pagesNum)
for idx in range(pagesNum):
    # The blog's list pages are numbered from 1, so request page idx + 1.
    cs.setpage(idx + 1)
    print('Current page', idx + 1)
    papers = cs.readData()
    savefile(papers, idx)