# -*- coding: utf-8 -*-
from urllib import request
import urllib
from lxml import etree
import datetime
import time
import random
import os
import pandas as pd
def setSleep(interval=40):
    """Sleep for a random duration between two consecutive URL requests.

    Sleeps a random number of whole seconds drawn uniformly from
    [interval - 30, interval - 10] to throttle requests and avoid being
    blocked by the server (no login is simulated by this crawler).

    Args:
        interval: base interval in seconds; the default 40 keeps the
            original behavior of a 10-30 second random sleep.

    Returns:
        0, always.
    """
    sleep_seconds = random.randint(interval - 30, interval - 10)
    print('sleeping ' + str(sleep_seconds) + ' seconds...')
    time.sleep(sleep_seconds)
    return 0
def getTimescope(start_day, end_day, CMlist):
    """Walk hour-by-hour from start_day to end_day and crawl each window.

    For every one-hour window it calls getWeiboDate() to fetch Weibo posts
    matching the keyword CMlist, appending results to a per-year csv under
    the module-level rootPath: "<rootPath><year>/<CMlist>.csv".

    Args:
        start_day: start time, format "YYYY-mm-dd HH:MM:SS".
        end_day:   end time (exclusive), same format.
        CMlist:    one search keyword (category); also the csv file name.
    """
    nowTime = datetime.datetime.strptime(start_day, '%Y-%m-%d %H:%M:%S')
    endTime = datetime.datetime.strptime(end_day, '%Y-%m-%d %H:%M:%S')
    priorYear = str(start_day)[0:4]
    timePath = rootPath + priorYear  # one folder per year
    os.makedirs(timePath, exist_ok=True)  # replaces bare try/except around makedirs
    fullPath = rootPath + priorYear + "/" + CMlist + ".csv"  # per-keyword csv
    # urlencode yields "kw=%E4%B8%AD%E5%8C%BB"; [3:] drops the "kw=" prefix,
    # leaving just the percent-encoded keyword for use in the search URL.
    onceCMlist = urllib.parse.urlencode({"kw": CMlist})[3:]
    print("改变类别编码:" + onceCMlist)
    delta = datetime.timedelta(hours=1)  # step size: one hour
    # Compare datetimes with "<" instead of string "!=": the original string
    # inequality never terminates when end_day is not exactly hour-aligned.
    while nowTime < endTime:
        priorTime = nowTime
        nowTime = nowTime + delta  # advance to the next hour
        # "YYYY-mm-dd HH:.." -> "YYYY-mm-dd-HH", the format s.weibo.com expects.
        changePriorTime = str(priorTime).replace(' ', '-')[0:13]
        changeNowTime = str(nowTime).replace(' ', '-')[0:13]
        print(changePriorTime)
        print(changeNowTime)
        nowYear = str(changeNowTime)[0:4]
        if priorYear != nowYear:  # crossed into a new year: switch output folder
            priorYear = nowYear
            timePath = rootPath + priorYear
            if not os.path.exists(timePath):
                os.makedirs(timePath)
            fullPath = rootPath + priorYear + "/" + CMlist + ".csv"
        getWeiboDate(changePriorTime, changeNowTime, fullPath, onceCMlist)
def chineseMedicineList():
    """Return the list of search keywords (categories) to crawl."""
    # "中药" is intentionally excluded from the keyword set.
    return ["中医", "把脉"]
def getWeiboDate(changePriorTime, changeNowTime, fullPath, onceCMlist):
    """Fetch one hour of Weibo search results and append them to a csv.

    Requests the s.weibo.com search page for the percent-encoded keyword
    onceCMlist restricted to the window [changePriorTime, changeNowTime],
    extracts post id, nickname, text and publish time via xpath, and appends
    one csv row per post to fullPath. On any fetch/parse error the whole
    window is retried. A setSleep() call throttles consecutive requests.

    Args:
        changePriorTime: window start, "YYYY-mm-dd-HH".
        changeNowTime:   window end, "YYYY-mm-dd-HH".
        fullPath:        csv file the rows are appended to.
        onceCMlist:      percent-encoded search keyword.
    """
    print("----------------------------")
    # NOTE(fix): the query parameter is "&timescope=custom:..."; the previous
    # string contained the mojibake "×cope" ("&times" rendered as "×"),
    # which produced a broken URL.
    url = ("https://s.weibo.com/weibo?q=" + onceCMlist
           + "&timescope=custom:" + changePriorTime + ":" + changeNowTime
           + "&Refer=g")
    # Retry in a loop rather than recursing: the original recursive retry
    # could exhaust the call stack when a window kept failing.
    while True:
        try:
            res = request.urlopen(url, timeout=12)
            txt = res.read().decode('utf-8')
            page = etree.HTML(txt)
            dls = page.xpath("//div[@mid]")  # each post card carries a @mid
            for dl in dls:
                mid = str(dl.attrib.get('mid'))  # post id
                print("获取编号:" + mid)
                base = ("//div[@mid=" + mid
                        + "]/div[@class='card']/div[@class='card-feed']"
                          "/div[@class='content']")
                # Posts collapsed behind a "展开全文" link keep the full text in
                # a hidden <p style="display: none">; fall back to the visible
                # <p class='txt'> when no hidden paragraph exists.
                try:
                    result = page.xpath(base + "/p[@style='display: none']")[0]
                except IndexError:
                    result = page.xpath(base + "/p[@class='txt']")[0]
                evaluationText = result.xpath('string(.)')
                evaluationText = (str(evaluationText)
                                  .replace('收起全文d', '')
                                  .replace(" ", "")
                                  .replace("\n", ""))  # strip footer/whitespace
                print("这是评论内容:" + str(evaluationText))
                urlnickName = page.xpath(base + "/p[@nick-name]")  # author name
                nike_name = str(urlnickName[0].attrib.get('nick-name'))
                print("nike_name:" + nike_name)
                thisistime = page.xpath(base + "/p[@class='from']/a/text()")  # publish time
                istime = str(thisistime[0]).replace(" ", "").replace("\n", "")
                print("发表时间:" + istime)
                # Append one row (id, user, text, publish time) to the csv.
                allinfomation = [(mid, nike_name, evaluationText, istime)]
                pd.DataFrame(allinfomation).to_csv(
                    fullPath, header=False, index=False,
                    mode='a+', encoding='utf_8_sig')
            break  # window crawled successfully
        except Exception:
            print("报错了 报异常解决 再来一遍 ~~~哈哈哈哈哈哈")
            setSleep()  # back off before retrying this window
    # Throttle before the next URL to avoid being blocked.
    setSleep()
'''
Output layout for crawled content: rootPath + <crawl year> + "/" + <category>.csv
'''
rootPath = "E:/AApaper/weiboData/"  # root directory for all crawl output

# Crawl every search keyword over the configured time window.
chineseMedicineLists = chineseMedicineList()
for CMlist in chineseMedicineLists:
    startime = "2015-05-09 10:00:00"  # crawl window start
    endtime = "2015-07-01 01:00:00"   # crawl window end (exclusive)
    getTimescope(startime, endtime, CMlist)
    print(CMlist)