最近在做自媒体相关的事情,但是苦于手中没有数据,我要去爬取数据,那么我选择的编程语言是python。
直接介绍如何爬数据:首先第一步,我们需要先在网页上登录微博,然后取得登录后需要用到的 Cookie。
想知道如何获取 Cookie 吗?可以点击超链接,虽然里面没有直接说明如何获取 Cookie,但是你可以通过学习上篇文章自己摸索一下。
其次我们这里用的数据库是mysql,请自行安装
创建数据库表如下图表数据结构
有空我再把过程补全吧。
直接上代码
weibobase.py 这个是获取数据的对象定义
#!/usr/bin/python
# -*- coding: UTF-8 -*-
class wb_uset():
    """Mutable record describing one Weibo user account.

    Plain class-level fields that the crawler fills in before handing
    the record to the persistence layer (MySQL.py).
    """
    id = ""               # user id
    screen_name = ""      # display name
    description = ""      # profile bio text
    avatar_hd = ""        # URL of the high-resolution avatar
    profile_url = ""      # URL of the user's profile page
    followers_count = ""  # number of followers
    follow_count = ""     # number of accounts this user follows
class wb_mblog():
    """Mutable record describing one Weibo post ("mblog").

    Plain class-level fields that the crawler fills in before handing
    the record to the persistence layer (MySQL.py).
    """
    id = ""               # post id
    created_at = ""       # publication time
    scheme = ""           # permalink of the single post
    text = ""             # body text
    source = ""           # client the post was sent from
    attitudes_count = ""  # like count
    comments_count = ""   # comment count
    reposts_count = ""    # repost count
    stream_url = ""       # video stream URL
    page_url = ""         # video playback page URL
    obj_ext = ""          # play count
    image_urls = []       # attached image URLs (class-level list, reassigned per post)
MYSQL.py 这个是关于数据库操作的类
这里面主要是保存数据,更新数据的操作
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import weibobase
import MySQLdb
import re
# Shared module-level MySQL connection used by every helper below.
# NOTE(review): credentials are hard-coded in source — move them to a
# config file / environment variables before publishing.
db = MySQLdb.connect("localhost", "root", "xxxxxx", "new_webei", use_unicode=True, charset="utf8")
def saveMblog(self, myitem=weibobase.wb_mblog):
    """Insert one post record (wb_mblog) into weibo_mblog.

    Commits on success; on any MySQL error it logs the problem and
    rolls the transaction back instead of raising.
    """
    cursor = db.cursor()
    try:
        # Parameterized statement: the driver escapes every value, which
        # prevents SQL injection and fixes quoting bugs when the post
        # text itself contains quotes (the original interpolated values
        # straight into the SQL string).
        sql = ("INSERT INTO weibo_mblog(mblog_id, created_at, scheme, text, "
               "attitudes_count, comments_count, reposts_count, source, "
               "stream_url, page_url, obj_ext) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        cursor.execute(sql, (myitem.id, myitem.created_at, myitem.scheme,
                             myitem.text, myitem.attitudes_count,
                             myitem.comments_count, myitem.reposts_count,
                             myitem.source, myitem.stream_url,
                             myitem.page_url, myitem.obj_ext))
        db.commit()
    except MySQLdb.Error as e:  # log and roll back on failure
        print("saveMblog--错误%s" % e)
        print("错误对象%s" % myitem.text)
        db.rollback()
def saveUset(self, myitem=weibobase.wb_uset):
    """Insert one user record (wb_uset) into weibo_user.

    Commits on success; on any MySQL error it logs the problem and
    rolls the transaction back instead of raising.
    """
    cursor = db.cursor()
    try:
        # Parameterized statement instead of %-interpolated SQL: the
        # driver escapes every value, preventing SQL injection and
        # quoting bugs when e.g. the screen name contains quotes.
        sql = ("INSERT INTO weibo_user(user_id, avatar_hd, description, "
               "screen_name, profile_url, followers_count, follow_count) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s)")
        cursor.execute(sql, (myitem.id, myitem.avatar_hd, myitem.description,
                             myitem.screen_name, myitem.profile_url,
                             myitem.followers_count, myitem.follow_count))
        db.commit()
    except MySQLdb.Error as e:  # log and roll back on failure
        print("saveUset--错误%s" % e)
        print("错误对象%s" % myitem.profile_url)
        db.rollback()
def selectMblog(self, myitem=weibobase.wb_mblog):
    """Upsert one post record, then persist its image URLs.

    The post text is cleaned first: HTML tags are stripped and
    characters outside the BMP (e.g. emoji) are replaced with '??',
    because the tables use MySQL's 3-byte 'utf8' charset which cannot
    store 4-byte characters.
    """
    cursor = db.cursor()
    try:
        exists = cursor.execute(
            "SELECT * FROM weibo_mblog WHERE mblog_id = %s", (myitem.id,))
        # Strip HTML markup from the body. (The original also deleted
        # single quotes to avoid breaking its hand-built SQL; with
        # parameterized queries that mangling is no longer needed.)
        myitem.text = re.sub(r'<[^>]+>', '', myitem.text)
        # Replace non-BMP characters with '??'. The original called
        # myitem.text.compile(...) (an AttributeError) and then ran the
        # substitution against an empty string, so the filter never
        # actually worked.
        try:
            # Wide (UCS-4) build: match supplementary planes directly.
            myitem.text = re.sub(u'[\U00010000-\U0010ffff]', u'??', myitem.text)
        except re.error:
            # Narrow (UCS-2) build: supplementary characters appear as
            # surrogate pairs.
            myitem.text = re.sub(u'[\ud800-\udbff][\udc00-\udfff]', u'??', myitem.text)
        if exists != 0:
            sql_update = ("UPDATE weibo_mblog SET created_at=%s, scheme=%s, "
                          "text=%s, attitudes_count=%s, comments_count=%s, "
                          "reposts_count=%s, source=%s, stream_url=%s, "
                          "page_url=%s, obj_ext=%s WHERE mblog_id = %s")
            cursor.execute(sql_update, (myitem.created_at, myitem.scheme,
                                        myitem.text, myitem.attitudes_count,
                                        myitem.comments_count,
                                        myitem.reposts_count, myitem.source,
                                        myitem.stream_url, myitem.page_url,
                                        myitem.obj_ext, myitem.id))
            db.commit()
        else:
            saveMblog(self, myitem)
    except MySQLdb.Error as e:  # log and roll back on failure
        print("selectMblog--错误%s" % e)
        print("错误对象%s" % myitem.text)
        db.rollback()
    # Pass myitem explicitly: the original wrote selectImageUrl(myitem),
    # which put myitem in the 'self' slot and only worked because the
    # default argument (the wb_mblog class itself) shared the same state.
    selectImageUrl(self, myitem)
def selectUset(self, myitem=weibobase.wb_uset):
    """Upsert one user record into weibo_user.

    Non-BMP characters in the bio are replaced with '??' first, because
    the tables use MySQL's 3-byte 'utf8' charset which cannot store
    4-byte characters.
    """
    cursor = db.cursor()
    try:
        # Replace non-BMP characters (emoji etc.). The original called
        # .compile on the string (an AttributeError) and substituted into
        # "", so the description was never actually filtered.
        try:
            # Wide (UCS-4) build: match supplementary planes directly.
            myitem.description = re.sub(u'[\U00010000-\U0010ffff]', u'??',
                                        myitem.description)
        except re.error:
            # Narrow (UCS-2) build: match surrogate pairs.
            myitem.description = re.sub(u'[\ud800-\udbff][\udc00-\udfff]',
                                        u'??', myitem.description)
        exists = cursor.execute(
            "SELECT * FROM weibo_user WHERE user_id = %s", (myitem.id,))
        if exists != 0:
            sql_update = ("UPDATE weibo_user SET avatar_hd=%s, description=%s, "
                          "screen_name=%s, profile_url=%s, followers_count=%s, "
                          "follow_count=%s WHERE user_id = %s")
            cursor.execute(sql_update, (myitem.avatar_hd, myitem.description,
                                        myitem.screen_name, myitem.profile_url,
                                        myitem.followers_count,
                                        myitem.follow_count, myitem.id))
            db.commit()
        else:
            saveUset(self, myitem)
    except MySQLdb.Error as e:  # log and roll back on failure
        print("selectUset--错误%s" % e)
        print("错误对象%s" % myitem.profile_url)
        db.rollback()
# 关闭数据库连接
def selectImageUrl(self, myitem=weibobase.wb_mblog):
    """Insert each of the post's image URLs, skipping ones already stored."""
    cursor = db.cursor()
    try:
        for image_url in myitem.image_urls:
            # Parameterized queries (the original %-interpolated the URL
            # into the SQL string, which breaks on quotes and is
            # injectable).
            exists = cursor.execute(
                "SELECT * FROM weibo_image_urls WHERE image_url = %s",
                (image_url,))
            if exists == 0:
                cursor.execute(
                    "INSERT INTO weibo_image_urls(image_url, mblog_id) "
                    "VALUES (%s, %s)", (image_url, myitem.id))
                db.commit()
    except MySQLdb.Error as e:  # log and roll back on failure
        print("selectImageUrl--错误%s" % e)
        print("错误对象%s" % myitem.text)
        db.rollback()
# Close the shared module-level connection; no further queries can run
# afterwards. NOTE(review): the name is a typo for "db_close", but callers
# (e.g. GetWeibo.py) reference the misspelled name, so it is kept.
def db_cloce(self):
    db.close()
接下来重头戏来了,开始获取数据
GetWeibo.py主要是获取数据
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
import MySQL
import json
import urllib
import weibobase
import sys
import time
# Python 2 hack: reload(sys) re-exposes setdefaultencoding (removed at
# startup) so implicit str<->unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')
# Session cookies captured from a logged-in m.weibo.cn browser session.
# NOTE(review): these are account credentials and are time-limited — they
# belong in config, not source control, and have likely expired.
cookie = {"Apache": "4085444405648.557.1517558859962",
          "H5_INDEX": "2",
          "H5_INDEX_TITLE": "%E7%A7%8B%E5%86%AC%E6%9A%96%E8%89%B2%E7%B3%BB",
          "M_WEIBOCN_PARAMS": "lfid%3D1005052109066367%252Fhome%26luicode%3D20000174%26fid%3D102803%26uicode%3D10000011",
          "SCF": "AlPdz7Wu9iu_xwiWfMtd1hBGr6mZqaKtCcidCgPrDl6ocdl8HcIvA5NZpk0cm36a0xrCpnFl0ZgfV-Bc5BUAktQ.",
          "SSOLoginState": "1520562809",
          "SUB": "_2A253pYIoDeRhGeRP61sR9ijPzTuIHXVVaS5grDV6PUJbktAKLRLQkW1NUFPZQRFUxRYf5itrGk6VqEtGIU3izGDT",
          "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5MyLbIiX5quKaqF190KSgT5JpX5K-hUgL.Fozpeh.7Soq0SoM2dJLoIEXLxKMLBKML12zLxK-L1hqLB-eLxKqL1-2L1KqLxKnL1h.LBozLxKMLBoeLB.zt",
          "SUHB": "0Elrkzb0Smx-GW",
          "WEIBOCN_FROM": "1110006030",
          "_T_WM": "46f8072dc2db4752c9f5f1bb610d6934",
          "browser": "d2VpYm9mYXhpYW4%3D",
          # NOTE(review): key ends with a space — presumably accidental,
          # kept as captured.
          "h5_deviceID ": "da4db009e6ae38320111cc4fbc8d1998",
          }
# Cookies for a second account. NOTE(review): defined but never used in
# this file — GetWieBo.request only sends `cookie` above.
cookie2 = {"ALF": "1522043003",
           "M_WEIBOCN_PARAMS": "luicode%3D10000011%26lfid%3D102803%26fid%3D102803%26uicode%3D10000011",
           "SCF": "AlPdz7Wu9iu_xwiWfMtd1hBGr6mZqaKtCcidCgPrDl6oNht3rRthMvGzFst-DncCt1l6_LYi6h6jCGNO6OtXVDU.",
           "SUB": "_2A253lIvWDeRhGeRP61sR9ijPzTuIHXVVdhWerDV6PUJbktANLVTakW1NUFPZQVmJdEJdcebLE3J8mIqAPe4rxEz4",
           "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5MyLbIiX5quKaqF190KSgT5JpX5K-hUgL.Fozpeh.7Soq0SoM2dJLoIEXLxKMLBKML12zLxK-L1hqLB-eLxKqL1-2L1KqLxKnL1h.LBozLxKMLBoeLB.zt",
           "SUHB": "0pHAjcQEUb1cye",
           "WEIBOCN_FROM": "1110006030",
           "_T_WM": "46f8072dc2db4752c9f5f1bb610d6934",
           "browser": "d2VpYm9mYXhpYW4%3D",
           "h5_deviceID ": "da4db009e6ae38320111cc4fbc8d1998",
           }
# Browser-like request headers for m.weibo.cn (mobile Safari UA).
# NOTE(review): defined but never passed to requests.get in this file.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': "gzip, deflate, br",
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'm.weibo.cn',
    'Cookie': 'browser=d2VpYm9mYXhpYW4%3D; h5_deviceID=da4db009e6ae38320111cc4fbc8d1998; _T_WM=46f8072dc2db4752c9f5f1bb610d6934; ALF=1523154787; SCF=AlPdz7Wu9iu_xwiWfMtd1hBGr6mZqaKtCcidCgPrDl6ocdl8HcIvA5NZpk0cm36a0xrCpnFl0ZgfV-Bc5BUAktQ.; SUB=_2A253pYIoDeRhGeRP61sR9ijPzTuIHXVVaS5grDV6PUJbktAKLRLQkW1NUFPZQRFUxRYf5itrGk6VqEtGIU3izGDT; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5MyLbIiX5quKaqF190KSgT5JpX5K-hUgL.Fozpeh.7Soq0SoM2dJLoIEXLxKMLBKML12zLxK-L1hqLB-eLxKqL1-2L1KqLxKnL1h.LBozLxKMLBoeLB.zt; SUHB=0Elrkzb0Smx-GW; SSOLoginState=1520562809; H5_INDEX=2; H5_INDEX_TITLE=%E7%A7%8B%E5%86%AC%E6%9A%96%E8%89%B2%E7%B3%BB; WEIBOCN_FROM=1110006030; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D102803%26fid%3D102803%26uicode%3D10000011',
    'RA-Sid': 'B781E81A-20150402-024118-ce25e1-ba5345',
    'RA-Ver': '3.0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
}
class GetWieBo(object):
    """Crawler for the m.weibo.cn hot-stream feed; stores results via MySQL.py."""

    alls_num = 0  # running total; not used inside this class

    def __init__(self):
        super(GetWieBo, self).__init__()
        print(u'数据获取中,CTRL+C 退出程序...')

    def request(self, num=0):
        """Fetch one feed page (since_id=num) and upsert every card found.

        On request/parse failure it logs, backs off, and returns; the
        original fell through with json_base=[] and then indexed it with
        a string key, raising TypeError.
        """
        try:
            url = 'https://m.weibo.cn/api/container/getIndex?containerid=102803&since_id=%s' % num
            json_str = requests.get(url, cookies=cookie).content
            json_base = json.loads(json_str)
        except Exception as e:
            # Original printed the literal "HTTPError--错误%s" — the
            # format argument was missing.
            print("HTTPError--错误%s" % e)
            time.sleep(1000)  # long back-off before the caller retries
            return
        cards_list = json_base["data"]["cards"]
        print(len(cards_list))
        if len(cards_list) == 0:
            time.sleep(1000)  # empty page: back off before continuing
        for cards in cards_list:
            mblog_json = cards["mblog"]
            user_json = mblog_json["user"]
            # NOTE(review): attributes are set on the CLASSES, not on
            # instances, so state is shared globally. MySQL.py relies on
            # this (selectImageUrl reaches the data through its default
            # argument), so it is kept; do not switch to instances
            # without also fixing MySQL.py.
            user = weibobase.wb_uset
            user.avatar_hd = user_json.get("avatar_hd")            # avatar URL
            user.description = user_json.get("description")        # bio
            user.screen_name = user_json.get("screen_name")        # display name
            user.profile_url = user_json.get("profile_url")        # profile page
            user.followers_count = user_json.get("followers_count")
            user.follow_count = user_json.get("follow_count")
            user.id = user_json.get("id")
            mblog = weibobase.wb_mblog
            mblog.id = mblog_json.get("id")
            mblog.scheme = cards.get("scheme")                     # post permalink
            mblog.text = mblog_json.get("text")                    # body text
            mblog.created_at = mblog_json.get("created_at")
            mblog.attitudes_count = mblog_json.get("attitudes_count")
            mblog.comments_count = mblog_json.get("comments_count")
            mblog.reposts_count = mblog_json.get("reposts_count")
            mblog.source = mblog_json.get("source")                # client name
            mblog.obj_ext = mblog_json.get("obj_ext")              # play count
            mblog.image_urls = []
            pics_list = mblog_json.get("pics")
            if not pics_list:
                print("不是图片/没有图片")
            else:
                for pics in pics_list:
                    # Prefer the large image; .get avoids the KeyError the
                    # original risked by indexing pics["large"] directly.
                    large = pics.get("large")
                    mblog.image_urls.append(large["url"] if large else pics["url"])
            mblog.stream_url = ""
            mblog.page_url = ""
            page_info = mblog_json.get("page_info")
            if not page_info:
                print("不是视频")
            elif not page_info.get("media_info"):
                print("未获取到视频地址")
            else:
                mblog.stream_url = page_info.get("media_info").get("stream_url")
                mblog.page_url = page_info.get("page_url")
            MySQL.selectUset(user)
            MySQL.selectMblog(mblog)
        # The original ended with `MySQL.db_cloce` (no parentheses) — a
        # no-op. Actually calling it would close the shared connection
        # and break the caller's next request, so the line was removed.
主方法
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import time
import random
from GetWeibo import GetWieBo
if __name__ == '__main__':
    spider = GetWieBo()
    # Walk since_id pages 1..202, pausing 3-6 seconds between requests
    # to avoid hammering the endpoint.
    for page in range(1, 203):
        time.sleep(random.randint(3, 6))
        spider.request(page)