Backing up a Sohu blog with Python

It is no secret that blogging is in decline, and the platform could be shut down any day, so I used Python to back up the old content. The two scripts below do the work in two steps.

1. Crawl the entry list pages and save each post link into the database

# -*- coding:utf-8 -*-

import urllib.request
from urllib import request
from bs4 import BeautifulSoup
import sqlite3

domain="TTTT";####此处修改为你的博客域名
url = "http://"+domain+".blog.sohu.com/entry/"
urlFile = urllib.request.urlopen(url)
data = urlFile.read()
urlFile.close()
data = data.decode('utf-8',errors='ignore')
print("get page success")
pre = "var _ebi = \'"
index1 = data.find(pre) + len(pre)
index2 = data.find('\'', index1)

ebi=data[index1 : index2];
print("ebi:"+ebi)

pre = "var totalCount = "
index1 = data.find(pre) + len(pre)
index2 = data.find(';', index1)
print("totalcount:"+data[index1 : index2])
totalPage="";
if (int(data[index1 : index2]))%20>0:
totalPage=str(int(int(data[index1 : index2])/20+1))
else:
totalPage=str(int(int(data[index1 : index2])/20))
print("totalpage:"+totalPage);


# Fetch one page of the entry list and save every entry link it contains.
def getBlogList(pageId):
    url = "http://" + domain + ".blog.sohu.com/action/v_frag-ebi_" + ebi + "-pg_" + pageId + "/entry/"
    print("get url:" + url)

    # 1. Fetch the page HTML.
    with request.urlopen(url) as f:
        html_doc = f.read()
    html_doc = html_doc.decode('utf-8', errors='ignore')

    # 2. Parse the page for entry titles and links; each entry sits in a
    #    <div class="newBlog-list-title"> block with an <a> pointing at the post.
    soup = BeautifulSoup(html_doc, "html.parser")
    news_array = soup.find_all('div', {'class': 'newBlog-list-title'})
    for news in news_array:
        if news.a:
            print(news.a.get("href"))   # the entry link
            save(news.a.get("href"))
            # print(news.a.string)      # the entry title

# Store one entry link in blog.db; status=0 marks rows whose content has not been fetched yet.
def save(link, title=None):
    if title is None:
        title = ""
    conn = sqlite3.connect('blog.db')
    cursor = conn.cursor()
    # Create the blog table on first use.
    cursor.execute('create table IF NOT EXISTS blog '
                   '(id INTEGER PRIMARY KEY, title varchar(100), link varchar(100), '
                   'content text, postdate varchar(100), status Integer)')
    # Skip links that are already in the database; use parameterized SQL to avoid quoting problems.
    cursor.execute('select * from blog where link=?', (link,))
    values = cursor.fetchall()
    if len(values) > 0:
        print('link already exists: ' + link)
    else:
        cursor.execute('insert into blog (title, link, status) values (?, ?, 0)', (title, link))
        conn.commit()
        print("save success." + link)
    # Close the cursor, commit, and close the connection.
    cursor.close()
    conn.commit()
    conn.close()

# Walk every list page from 1 to totalPage and record any page that failed.
errorLink = []
for x in range(1, int(totalPage) + 1):
    try:
        getBlogList(str(x))
    except Exception as e:
        print('except:', e)
        errorLink.append(x)
print("errorLink:" + str(errorLink))




2. Fetch each content page and save the content into the database

# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib.request
from urllib import request
# 导入SQLite驱动:
import sqlite3


# Fetch the content for every row still marked status=0 and fill in title, content and postdate.
def updateContent():
    conn = sqlite3.connect('blog.db')
    cursor = conn.cursor()
    cursor.execute('select * from blog where status=0')
    values = cursor.fetchall()

    for line in values:
        id = line[0]
        link = line[2]

        soup = getContent(link)

        try:
            title = soup.find('div', {'class': 'item-body'}).h2.span.get_text()
            postdate = soup.find('span', {'class': 'date'}).get_text()
            content = str(soup.find('div', {'class': 'item-content'}))
            # Drop the fixed-length opening of the wrapper div and everything
            # from the closing <div class="clear"> onwards.
            end = "<div class=\"clear\"></div>"
            content = content[45:content.find(end)]

            print(link)
            cursor.execute('update blog set title=?,content=?,status=1,postdate=? where id=?',
                           (title, content, postdate, id))
            conn.commit()
        except Exception as e:
            print('except:', e)
    cursor.close()
    conn.commit()
    conn.close()


# Fetch one post page by its link and return the parsed soup.
def getContent(link):
    # 1. Fetch the page HTML.
    html_doc = ""
    # Build the request headers; at minimum these two fields (taken from a captured
    # browser request) are needed so the server treats the fetch like a normal visit.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0',
               'Referer': link}

    try:
        req = urllib.request.Request(link, None, headers)
        html_doc = urllib.request.urlopen(req).read()
    except Exception as e:
        print('except:', e)

    # 2. Parse the page so the caller can pick out the pieces it needs.
    soup = BeautifulSoup(html_doc, "html.parser")
    return soup

# Fetch every post that has no content yet and fill the content in.
updateContent()
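
Once both scripts have run, every post lives in blog.db. If you also want plain files on disk, the sketch below (not part of the original scripts; the 'backup' directory name and file layout are my own assumptions) dumps each fetched row to a standalone HTML file:

# -*- coding:utf-8 -*-
# Export sketch: write each fetched post (status=1) in blog.db to its own HTML file.
import os
import sqlite3

def exportToHtml(outdir='backup'):
    os.makedirs(outdir, exist_ok=True)
    conn = sqlite3.connect('blog.db')
    cursor = conn.cursor()
    cursor.execute('select id, title, postdate, content from blog where status=1')
    for id, title, postdate, content in cursor.fetchall():
        path = os.path.join(outdir, str(id) + '.html')   # file name = row id, avoids bad title characters
        with open(path, 'w', encoding='utf-8') as f:
            f.write('<html><head><meta charset="utf-8"><title>%s</title></head><body>' % title)
            f.write('<h1>%s</h1>\n<p>%s</p>\n%s' % (title, postdate, content))
            f.write('</body></html>')
    cursor.close()
    conn.close()

exportToHtml()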
