python(pycharm) 爬取微博内容
通过关键字,爬取微博的内容、发布时间、链接等信息
例如:(文本有些折叠了)
关键词:台风
代码如下:
from selenium import webdriver
from lxml import etree
from urllib import parse
from time import sleep
import datetime
from xlutils.copy import copy
import xlrd
import time
keyword = '台风' # search keyword to scrape (here: "typhoon")
y = 2020 # start year
m = 3 # start month
d = 10 # start day
days = 20 # number of consecutive days to scrape
url_keyword = parse.quote(keyword) # percent-encode the keyword so it is safe to embed in the URL
def getday(y, m, d, n):
    """Return the date n days after (y, m, d) as a 'YYYY-MM-DD' string.

    Args:
        y: year of the base date.
        m: month of the base date.
        d: day of the base date.
        n: offset in days; may be negative to step backwards.

    Returns:
        The shifted date formatted as 'YYYY-MM-DD'.
    """
    the_date = datetime.datetime(y, m, d)
    result_date = the_date + datetime.timedelta(days=n)
    # Return directly instead of reassigning the parameter `d` as the
    # original did -- shadowing a parameter obscures intent.
    return result_date.strftime('%Y-%m-%d')
def p(days, x):
    """Scrape Weibo search results for the global `keyword`, hour by hour.

    Starting from the global date (y, m, d), iterates `days` days; for each
    day requests the 24 one-hour search windows, parses poster name, post
    text, timestamp, source and link out of each result page, and appends
    them to wb.xls.

    Args:
        days: number of consecutive days to scrape.
        x: first spreadsheet row index to write to.
    """
    # Launch ONE browser for the whole run -- starting/quitting Chrome per
    # request was the dominant cost in the original version.
    bro = webdriver.Chrome(executable_path=r'D:\python\chorm\chromedriver.exe')
    try:
        for i in range(days):
            day = getday(y, m, d, i)
            for j in range(24):  # one URL per one-hour window
                if j == 23:
                    # The last window of day i ends at hour 0 of day i+1.
                    # (The original used getday(y, m, d, -(i - 1)), which is
                    # only correct on the first day.)
                    window = day + '-23:' + getday(y, m, d, i + 1) + '-0'
                else:
                    window = day + '-' + str(j) + ':' + day + '-' + str(j + 1)
                # NOTE: the original URL contained '×cope' -- the '&times'
                # of '&timescope' had been swallowed as an HTML entity, so
                # the time filter never applied.
                url = ('https://s.weibo.com/weibo?q=' + url_keyword
                       + '&typeall=1&suball=1&timescope=custom:' + window)
                print(url)
                bro.get(url)
                sleep(2)  # wait for the page to finish loading
                page_text = bro.page_source
                # --- parse ---
                tree = etree.HTML(page_text)
                wb_time = tree.xpath(".//*[@id='pl_feedlist_index']/div[2]/div[1]/div/div[1]/div[2]/p[2]/a[1]/text()")
                wb_name = tree.xpath(
                    ".//*[@id='pl_feedlist_index']/div[2]/div[2]/div/div[1]/div[2]/div[1]/div[2]/a[1]/text()")
                wb_text = tree.xpath(".//*[@id='pl_feedlist_index']/div[2]/div[2]/div/div[1]/div[2]/p[1]//text() ")
                wb_from = tree.xpath(".//*[@id='pl_feedlist_index']/div[2]/div[5]/div/div[1]/div[2]/p[3]/a[2]/text()")
                wb_href = tree.xpath(".//*[@id='pl_feedlist_index']/div[2]/div[1]/div/div[1]/div[2]/p[2]/a[1]/@href")
                if not (wb_name or wb_text or wb_time):
                    # Some hour windows contain no posts at all -- skip them
                    # instead of writing empty rows.
                    continue
                # --- store ---
                rb = xlrd.open_workbook('wb.xls')  # open existing workbook
                wb = copy(rb)  # xlutils.copy gives a writable copy
                ws = wb.get_sheet(0)  # sheet 0
                # xlwt cells accept scalars, not lists: join the xpath result
                # lists and strip the whitespace padding Weibo wraps them in.
                ws.write(x, 1, ' '.join(s.strip() for s in wb_name).strip())
                print(wb_name)
                ws.write(x, 2, ' '.join(s.strip() for s in wb_href).strip())
                print(wb_href)
                ws.write(x, 3, ' '.join(s.strip() for s in wb_text).strip())
                print(wb_text)
                ws.write(x, 4, ' '.join(s.strip() for s in wb_time).strip())
                print(wb_time)
                ws.write(x, 5, ' '.join(s.strip() for s in wb_from).strip())
                print(wb_from)
                x = x + 1
                print(x)
                wb.save('wb.xls')  # persist after every row
    finally:
        bro.quit()  # always release the browser
if __name__ == '__main__':
    # Row 0 of wb.xls is assumed to hold headers -- TODO confirm; start
    # writing scraped rows at row 1.
    p(days, 1)
有几个问题还没完善:
1. 使用 selenium 逐页抓取太慢(可以考虑多线程/多进程并发抓取);
2. 获取的文本和时间带有多余的空格(可以用 strip 或正则清理);
3. 某些时间段可能没有微博,会爬到空的结果(需要加一个判空再写入)。