Scraping Weibo Data with Python and Storing It in MySQL and Excel

Over winter break I had to compile Weibo statistics for my department. There was no way around it, and since I am lazy, I just automated it in Python. The code is rough and not very good, so please bear with me.

Overall it splits into two modules: one scrapes Weibo data into a database, and one pulls that data back out into an Excel sheet. There is also a config file: the duty roster changes almost every week, each report covers Monday through Sunday, and for various reasons real-time statistics were not an option, so the config file sets the per-week parameters for the code. Note that I created the MySQL tables in advance. (When I came back from break, one of MySQL's configuration files had inexplicably gone missing; it took me a long time to reinstall, and all the data was lost.)

Honestly, the code is simple; what deserves attention are the quirks in the scraped data:

The desktop site and the mobile endpoint return different data for the same account, and some posts or fields go missing on one side. Before running the full crawler it helps to dump one page of the mobile API and confirm the field names, as sketched below.
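A minimal sketch for inspecting the m.weibo.cn container API (the same endpoint and uid the crawler below uses; the card layout reflects the API as of early 2018 and may have changed since):

import requests

headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/63.0.3239.132 Mobile Safari/537.36'}
url = ('https://m.weibo.cn/api/container/getIndex'
       '?type=uid&value=2348668214&containerid=1076032348668214&page=1')

cards = requests.get(url, headers=headers).json()['data']['cards']
for card in cards:
    mblog = card.get('mblog')  # pinned/filter cards carry no 'mblog' key
    if mblog:
        print(mblog['created_at'], mblog['attitudes_count'],
              mblog['comments_count'], mblog['reposts_count'])

The crawler script itself: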

import requests
import time
import pymysql
import re
from pyquery import PyQuery as py

# Connect to the local MySQL database (tables were created in advance).
db = pymysql.connect('localhost', 'root', '123456', 'hfut_weibo', use_unicode=True, charset="utf8")
cursor = db.cursor()
cursor.execute('SET NAMES utf8;')
cursor.execute('SET CHARACTER SET utf8;')
cursor.execute('SET character_set_connection=utf8;')

# Pretend to be a mobile browser so m.weibo.cn serves the mobile API.
headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36'}

def mysql(date, attitudes_count, comments_count, reposts_count, index, title, data):
    # Escape quotes before splicing the strings into SQL.
    data = transferContent(data)
    title = transferContent(title)
    # Posts tagged with one of the special-column hashtags go into their own table.
    columns = ('#冰糖说英语#', '#冰糖点歌台#', '#冰糖小百科#', '#冰糖的树洞#', '#冰糖么么答#',
               '#冰糖赠书#', '#冰糖书影音#', '#老照片#', '#直播#', '#送福利#')
    for com in columns:
        match = re.search(re.compile(com), title)
        if match:
            title = match.group()
            print(data)
            sql = '''
            INSERT INTO inital_weibo_data_special VALUES ("%s",'%d','%d','%d','%f',"%s","%s")
            ''' % (date, attitudes_count, comments_count, reposts_count, index, title, data)
            cursor.execute(sql)
            db.commit()
            return
    # Ordinary duty-shift posts go into the main table.
    sql = '''
    INSERT INTO inital_weibo_data VALUES ('%s',"%d","%d",'%d','%f',"%s","%s")
    ''' % (date, attitudes_count, comments_count, reposts_count, index, title, data)
    cursor.execute(sql)
    db.commit()
    return

def transferContent(content):
    string = ""
    for c in content:
        if c == '"':
            string += '\\"'
        elif c == "'":
            string += "\\'"
        else:
            string += c
    return string

def error_com(data):
    # The API returns post text as HTML and hashtags arrive as <a> links,
    # e.g. py('<a>#冰糖说英语#</a> ...').find('a').text() -> '#冰糖说英语#'.
    doc = py(data)
    return doc.find('a').text()

def data_store():
    Q = 0  # pages that failed to parse
    for i in range(1, 30):
        time.sleep(0.5)
        url = "https://m.weibo.cn/api/container/getIndex?uid=2348668214&luicode=10000011&lfid=100103type%3D1%26q%3D%E5%90%88%E8%82%A5%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6&featurecode=20000320&type=uid&value=2348668214&containerid=1076032348668214&page={num}".format(num=i)
        r_1 = requests.get(url, headers=headers)
        try:
            if i == 1:
                # Page 1 starts with two non-post cards, so begin at index 2.
                cards = range(2, 11)
            else:
                cards = range(0, 10)
            for j in cards:
                time.sleep(0.05)
                page_1 = r_1.json()['data']['cards'][j]['mblog']
                date = page_1['created_at']
                attitudes_count = page_1['attitudes_count']
                comments_count = page_1['comments_count']
                reposts_count = page_1['reposts_count']
                data = page_1['text']
                # Engagement index: likes + 0.5 * comments + 2 * reposts.
                index = attitudes_count + comments_count * 0.5 + reposts_count * 2
                # Recover the hashtag title from the HTML post text.
                doc = py(data)
                title = doc.find('a').text()
                print(date, attitudes_count, comments_count, reposts_count, index, title)
                try:
                    mysql(date, attitudes_count, comments_count, reposts_count, index, title, data)
                except:
                    # If the full text breaks the SQL, store the title as the text instead.
                    data = title
                    mysql(date, attitudes_count, comments_count, reposts_count, index, title, data)
                    continue
        except:
            # Pages with fewer cards than expected land here.
            Q = Q + 1
            continue
    print(Q)

def main():
    data_store()

if __name__ == '__main__':
    main()
    db.close()
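One note on the SQL: building INSERT statements with % formatting is what made transferContent necessary in the first place, and it is still fragile (and injection-prone). pymysql can do the escaping itself if the values are passed separately. A sketch of what that could look like, assuming the same tables and column order as above (insert_post is a hypothetical replacement for the mysql() function, not part of the original script):

def insert_post(table, date, attitudes, comments, reposts, index, title, text):
    # pymysql escapes each value itself, so no manual quote handling is needed.
    # The table name cannot be a parameter; only use trusted constants here.
    sql = 'INSERT INTO {} VALUES (%s, %s, %s, %s, %s, %s, %s)'.format(table)
    cursor.execute(sql, (date, attitudes, comments, reposts, index, title, text))
    db.commit()

The second script pulls the stored rows back out of MySQL and lays them out in an Excel workbook with xlsxwriter: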

import xlsxwriter
import pymysql
##################################################################
from config_155 import *
##################################################################

db = pymysql.connect('localhost', 'root', '123456', 'hfut_weibo', use_unicode=True, charset="utf8")
cursor = db.cursor()
cursor.execute('SET NAMES utf8;')
cursor.execute('SET CHARACTER SET utf8;')
cursor.execute('SET character_set_connection=utf8;')

# One workbook per week; document_name comes from the config file.
workbook = xlsxwriter.Workbook(r"E:\合肥工业大学微博值班数据\%s.xlsx" % document_name)

worksheet1 = workbook.add_worksheet("日常值班")
worksheet1.write(0, 0, '日期')
worksheet1.write(0, 1, '值班人')
worksheet1.write(0, 2, '转发数')
worksheet1.write(0, 3, '评论数')
worksheet1.write(0, 4, '点赞数')
worksheet1.write(0, 5, '指标数')

worksheet2 = workbook.add_worksheet("特色栏目")
worksheet2.write(0, 0, '栏目')
worksheet2.write(0, 1, '负责人')
worksheet2.write(0, 2, '转发数')
worksheet2.write(0, 3, '评论数')
worksheet2.write(0, 4, '点赞数')
worksheet2.write(0, 5, '指标数')

def mysql_read(time, name, i, sig):
    # sig == 1: daily duty query, keyed by date; otherwise special-column
    # query, keyed by hashtag title within the configured week.
    if sig == 1:
        sql = '''
        SELECT * FROM inital_weibo_data WHERE date='%s'
        ''' % time
    else:
        sql = '''
        SELECT * FROM inital_weibo_data_special WHERE title='%s' AND date BETWEEN '%s' AND '%s'
        ''' % (time, time_start, time_conclude)
    return sql

def excel_work(sql, i, time, name, worksheet):
    try:
        # Run the query and fetch every matching post.
        cursor.execute(sql)
        results = cursor.fetchall()
        print(results)
        worksheet.write(i, 0, time)
        worksheet.write(i, 1, name)
        index_all = 0
        attitudes_count_all = 0
        comments_count_all = 0
        reposts_count_all = 0
        # One row per post, accumulating weekly totals along the way.
        for row in results:
            date = row[0]
            attitudes_count = row[1]
            comments_count = row[2]
            reposts_count = row[3]
            index = row[4]
            index_all += index
            attitudes_count_all += attitudes_count
            comments_count_all += comments_count
            reposts_count_all += reposts_count
            worksheet.write(i, 2, reposts_count)
            worksheet.write(i, 3, comments_count)
            worksheet.write(i, 4, attitudes_count)
            worksheet.write(i, 5, index)
            i = i + 1
            print(date, attitudes_count, comments_count, reposts_count, index)
        # Totals row, then leave one blank row before the next block.
        worksheet.write(i, 2, reposts_count_all)
        worksheet.write(i, 3, comments_count_all)
        worksheet.write(i, 4, attitudes_count_all)
        worksheet.write(i, 5, index_all)
        return i + 2
    except:
        print("Error: unable to fetch data")

def main():
    i = 1
    for (k, v) in work_1.items():
        time = k
        name = v
        sql = mysql_read(time, name, i, 1)
        i = excel_work(sql, i, time, name, worksheet1)
    i = 1
    for (k, v) in work_0.items():
        time = k
        name = v
        sql = mysql_read(time, name, i, 0)
        i = excel_work(sql, i, time, name, worksheet2)

if __name__ == '__main__':
    main()
    db.close()
    workbook.close()

Finally, the contents of config_155.py for the week in question: work_1 maps each date to the person on duty, work_0 maps each special column to its owner, and the date range bounds the special-column query.

# config_155.py
# Weekday duty roster (for reference).
work = {'星期一': '张壮', '星期二': '王修齐', '星期三': '徐露露', '星期四': '张雪辰',
        '星期五': '孙艳丽', '星期六': '马玉龙', '星期日': '陈之羽'}

# Special columns and the people in charge of them.
work_0 = {'#冰糖说英语#': '彭二丹', '#冰糖点歌台#': '董令', '#冰糖小百科#': '周成旭',
          '#冰糖的树洞#': '张雪辰', '#冰糖么么答#': '', '#冰糖赠书#': '蒋媛媛',
          '#冰糖书影音#': '云曈', '#老照片#': '彭二丹', '#直播#': '蒋媛媛', '#送福利#': '彭二丹'}

# Dates covered by this week's report, keyed 'MM-DD'.
work_1 = {'01-08': '张壮', '01-09': '王修齐', '01-10': '徐露露', '01-11': '张雪辰',
          '01-12': '孙艳丽', '01-13': '马玉龙', '01-14': '陈之羽'}

document_name = '150周(第十九教学周)数据统计(2018.1.08-2018.1.14)'
time_start = '01-08'
time_conclude = '01-14'
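Since the roster and dates have to be re-entered by hand every week, a small helper could generate work_1 from a start date instead. A sketch, assuming the duty order stays fixed and the same 'MM-DD' key format (build_week is hypothetical, not part of the original scripts):

from datetime import date, timedelta

def build_week(start, roster):
    # Map each date of the week starting at `start` (a Monday) to the
    # person on duty that weekday, keyed as 'MM-DD'.
    return {
        (start + timedelta(days=d)).strftime('%m-%d'): roster[d]
        for d in range(7)
    }

# Example: the week of 2018-01-08, using the weekday roster from `work`.
roster = ['张壮', '王修齐', '徐露露', '张雪辰', '孙艳丽', '马玉龙', '陈之羽']
work_1 = build_week(date(2018, 1, 8), roster)
# -> {'01-08': '张壮', '01-09': '王修齐', ..., '01-14': '陈之羽'}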
