# -*- coding: utf-8 -*-
from urllib import request
import urllib
from lxml import etree
import datetime
import time
import random
import os
import pandas as pd
def setSleep(interval=40):
    """Sleep for a random duration between two consecutive URL requests.

    Sleeps a random number of whole seconds drawn uniformly from
    [interval - 30, interval - 10] to throttle requests and avoid being
    blocked by the server (no login is simulated by this crawler).

    Args:
        interval: base interval in seconds; the default 40 keeps the
            original behavior of a 10-30 second random sleep.

    Returns:
        0, always.
    """
    sleep_seconds = random.randint(interval - 30, interval - 10)
    print('sleeping ' + str(sleep_seconds) + ' seconds...')
    time.sleep(sleep_seconds)
    return 0
def getTimescope(start_day, end_day, CMlist):
    """Walk hour-by-hour from start_day to end_day and crawl each window.

    For every one-hour window it calls getWeiboDate() to fetch Weibo posts
    matching the keyword CMlist, appending results to a per-year csv under
    the module-level rootPath: "<rootPath><year>/<CMlist>.csv".

    Args:
        start_day: start time, format "YYYY-mm-dd HH:MM:SS".
        end_day:   end time (exclusive), same format.
        CMlist:    one search keyword (category); also the csv file name.
    """
    nowTime = datetime.datetime.strptime(start_day, '%Y-%m-%d %H:%M:%S')
    endTime = datetime.datetime.strptime(end_day, '%Y-%m-%d %H:%M:%S')
    priorYear = str(start_day)[0:4]
    timePath = rootPath + priorYear  # one folder per year
    os.makedirs(timePath, exist_ok=True)  # replaces bare try/except around makedirs
    fullPath = rootPath + priorYear + "/" + CMlist + ".csv"  # per-keyword csv
    # urlencode yields "kw=%E4%B8%AD%E5%8C%BB"; [3:] drops the "kw=" prefix,
    # leaving just the percent-encoded keyword for use in the search URL.
    onceCMlist = urllib.parse.urlencode({"kw": CMlist})[3:]
    print("改变类别编码:" + onceCMlist)
    delta = datetime.timedelta(hours=1)  # step size: one hour
    # Compare datetimes with "<" instead of string "!=": the original string
    # inequality never terminates when end_day is not exactly hour-aligned.
    while nowTime < endTime:
        priorTime = nowTime
        nowTime = nowTime + delta  # advance to the next hour
        # "YYYY-mm-dd HH:.." -> "YYYY-mm-dd-HH", the format s.weibo.com expects.
        changePriorTime = str(priorTime).replace(' ', '-')[0:13]
        changeNowTime = str(nowTime).replace(' ', '-')[0:13]
        print(changePriorTime)
        print(changeNowTime)
        nowYear = str(changeNowTime)[0:4]
        if priorYear != nowYear:  # crossed into a new year: switch output folder
            priorYear = nowYear
            timePath = rootPath + priorYear
            if not os.path.exists(timePath):
                os.makedirs(timePath)
            fullPath = rootPath + priorYear + "/" + CMlist + ".csv"
        getWeiboDate(changePriorTime, changeNowTime, fullPath, onceCMlist)
def chineseMedicineList():
    """Return the list of search keywords (categories) to crawl."""
    # "中药" is intentionally excluded from the keyword set.
    return ["中医", "把脉"]
def getWeiboDate(changePriorTime, changeNowTime, fullPath, onceCMlist):
    """Fetch one hour of Weibo search results and append them to a csv.

    Requests the s.weibo.com search page for the percent-encoded keyword
    onceCMlist restricted to the window [changePriorTime, changeNowTime],
    extracts post id, nickname, text and publish time via xpath, and appends
    one csv row per post to fullPath. On any fetch/parse error the whole
    window is retried. A setSleep() call throttles consecutive requests.

    Args:
        changePriorTime: window start, "YYYY-mm-dd-HH".
        changeNowTime:   window end, "YYYY-mm-dd-HH".
        fullPath:        csv file the rows are appended to.
        onceCMlist:      percent-encoded search keyword.
    """
    print("----------------------------")
    # NOTE(fix): the query parameter is "&timescope=custom:..."; the previous
    # string contained the mojibake "×cope" ("&times" rendered as "×"),
    # which produced a broken URL.
    url = ("https://s.weibo.com/weibo?q=" + onceCMlist
           + "&timescope=custom:" + changePriorTime + ":" + changeNowTime
           + "&Refer=g")
    # Retry in a loop rather than recursing: the original recursive retry
    # could exhaust the call stack when a window kept failing.
    while True:
        try:
            res = request.urlopen(url, timeout=12)
            txt = res.read().decode('utf-8')
            page = etree.HTML(txt)
            dls = page.xpath("//div[@mid]")  # each post card carries a @mid
            for dl in dls:
                mid = str(dl.attrib.get('mid'))  # post id
                print("获取编号:" + mid)
                base = ("//div[@mid=" + mid
                        + "]/div[@class='card']/div[@class='card-feed']"
                          "/div[@class='content']")
                # Posts collapsed behind a "展开全文" link keep the full text in
                # a hidden <p style="display: none">; fall back to the visible
                # <p class='txt'> when no hidden paragraph exists.
                try:
                    result = page.xpath(base + "/p[@style='display: none']")[0]
                except IndexError:
                    result = page.xpath(base + "/p[@class='txt']")[0]
                evaluationText = result.xpath('string(.)')
                evaluationText = (str(evaluationText)
                                  .replace('收起全文d', '')
                                  .replace(" ", "")
                                  .replace("\n", ""))  # strip footer/whitespace
                print("这是评论内容:" + str(evaluationText))
                urlnickName = page.xpath(base + "/p[@nick-name]")  # author name
                nike_name = str(urlnickName[0].attrib.get('nick-name'))
                print("nike_name:" + nike_name)
                thisistime = page.xpath(base + "/p[@class='from']/a/text()")  # publish time
                istime = str(thisistime[0]).replace(" ", "").replace("\n", "")
                print("发表时间:" + istime)
                # Append one row (id, user, text, publish time) to the csv.
                allinfomation = [(mid, nike_name, evaluationText, istime)]
                pd.DataFrame(allinfomation).to_csv(
                    fullPath, header=False, index=False,
                    mode='a+', encoding='utf_8_sig')
            break  # window crawled successfully
        except Exception:
            print("报错了 报异常解决 再来一遍 ~~~哈哈哈哈哈哈")
            setSleep()  # back off before retrying this window
    # Throttle before the next URL to avoid being blocked.
    setSleep()
'''
Output layout for crawled content: rootPath + <crawl year> + "/" + <category>.csv
'''
rootPath = "E:/AApaper/weiboData/"  # root directory for all crawl output

# Crawl every search keyword over the configured time window.
chineseMedicineLists = chineseMedicineList()
for CMlist in chineseMedicineLists:
    startime = "2015-05-09 10:00:00"  # crawl window start
    endtime = "2015-07-01 01:00:00"   # crawl window end (exclusive)
    getTimescope(startime, endtime, CMlist)
    print(CMlist)