python pycharm采集头条文章和对应评论

一、PyCharm导入selenium,参考下面文章

https://blog.csdn.net/u010381752/article/details/98955424

二、引入:

from selenium import  webdriver

三、初始化驱动


# Module-level WebDriver handle; holds the placeholder 0 until init_driver() runs.
dr = 0


def init_driver():
    """Create the Chrome WebDriver and store it in the module-level ``dr``.

    The chromedriver binary is looked up at a relative path inside the
    project's virtualenv. Returns ``None``.
    """
    global dr
    # Relative path to the chromedriver executable.
    driver_path = r'venv\Scripts\chromedriver.exe'
    dr = webdriver.Chrome(executable_path=driver_path)

四、采文章

def _extract_aid(url):
    """Return the article id that follows ``group/`` in *url*, or ``''`` if absent."""
    marker = "group/"
    start = url.find(marker)
    if start == -1:
        return ''
    tail = url[start + len(marker):]
    # The id ends at the next path separator, query string or fragment.
    for stop in "/?#":
        tail = tail.split(stop, 1)[0]
    return tail.strip()


def _unwrap_quot(text):
    """Remove one literal ``&quot;`` from each end of *text*, if present."""
    token = '&quot;'
    if text.startswith(token):
        text = text[len(token):]
    if text.endswith(token):
        text = text[:-len(token)]
    return text


# Fetch an article, store it in the database, and report the new row id so
# that related comments can be attached to it afterwards.
def AddArticleToData(resultDict):
    """Scrape one article and insert it (plus up to three images) into the DB.

    Args:
        resultDict: dict read for ``'url'``, ``'cate_id'`` and
            ``'param_cookie'``. On failure ``resultDict['err']`` is set to a
            short reason; on success ``resultDict['lastRowId']`` is set to the
            id of the inserted article row.

    Returns:
        ``True`` when the article was stored, ``False`` when it was skipped
        (no article id in the URL, page data not found, or duplicate).
    """
    global conn

    url = resultDict['url']
    # Read up front so a missing key fails fast.
    # NOTE(review): presumably consumed by the elided INSERT statement below.
    cate_id = resultDict['cate_id']
    param_cookie = resultDict['param_cookie']

    # Extract the article id; a URL without "group/<id>" is rejected instead
    # of yielding an arbitrary slice of the URL.
    aid = _extract_aid(url)
    if not aid:
        resultDict['err'] = "aid null"
        return False

    # Rebuild the canonical article URL.
    time_stamp = int(time.time())
    url = "https://www.toutiao.com/a%s/?timestamp=%s&app=news_article&group_id=%s" % (aid, time_stamp, aid)

    dr.get(url)
    str_html = dr.page_source
    searchObj = re.search(r'var BASE_DATA = ({[\s\S]*?});', str_html, re.M | re.I)
    # re.search returns None when the page has no BASE_DATA object.
    if searchObj is None:
        resultDict['err'] = "匹配文章时出错"
        return False
    content_temp = searchObj.group(1)

    # Drop JS method calls such as .slice(6, -6) so the literal can be decoded.
    content_temp = re.sub(r'\.\w+?\(.*?\)', '', content_temp)
    json_text = demjson.decode(content_temp)

    # Skip articles whose aid has already been recorded.
    if IsTitleRepeat(aid):
        resultDict['err'] = "txt:文章重复了哦!"
        return False

    # Skip articles whose title already exists in the database.
    # The page wraps values in a literal &quot; pair (what .slice(6, -6) would
    # have removed), so only that exact token is removed from each end.
    news_title = _unwrap_quot(json_text["articleInfo"]["title"])
    news_title = html.unescape(news_title)
    if IsTitleRepeatInDb(news_title):
        resultDict['err'] = "db:文章重复了哦!"
        return False
    news_content = _unwrap_quot(json_text["articleInfo"]["content"])
    news_content = html.unescape(news_content)

    # Download every image referenced in the content and point the content at
    # the local copy.
    src_imgs = re.findall(r'<img src="(.*?)".*?>', news_content)
    focus_img = []
    for src in src_imgs:
        # The remote file is requested with a ".jpg" suffix appended.
        img_loc = downloadIma(src + '.jpg')
        img_loc = img_loc.replace('\\', '/')
        img_loc = img_loc.replace(r'./', r'/')
        # List pages do not need the /data/attachment/tomwx prefix.
        focus_img.append(img_loc.replace('/data/attachment/tomwx', ''))
        # Replace the original src verbatim, so sources that already contain
        # ".jpg" are rewritten too.
        news_content = news_content.replace(src, img_loc)

    news_content = news_content.replace('<p>', '<p style="margin: 25px 0;">')
    news_content = news_content.replace('<img', '<img style="display:block;margin: 15px auto;"')

    # Insert into the database; the cursor is closed even if a statement fails.
    cursor = conn.cursor()
    try:
        time_stamp = int(time.time())
        # NOTE(review): presumably used by the elided INSERT statement.
        virtrue_click = random.randint(35, 178)
        sql_insert = "INSERT INTO ..."
        cursor.execute(sql_insert)
        lastRowId = cursor.lastrowid
        # Commit, otherwise the database is not updated.
        conn.commit()

        # Attach at most the first three images to the new article row.
        temp_list = [
            (lastRowId, '1', img, '', '1', None, '0', None, '100000', time_stamp, None, None, None)
            for img in focus_img[:3]
        ]
        sql_insert = "INSERT INTO ..."
        if temp_list:
            cursor.executemany(sql_insert, temp_list)
            conn.commit()
    finally:
        cursor.close()

    # Record the aid so the article is recognised as already posted.
    with open('data/newsposted.txt', 'a') as filehand:
        filehand.write('%s\n' % aid)

    resultDict['lastRowId'] = lastRowId
    return True

五、采集评论

# Fetch the comment list of an article and store a random-sized sample of it.
def AddCommentToData(aid, article_id):
    """Download comments for article *aid* and insert them into the database.

    Between 8 and 15 comments (fewer if the article has fewer) are fetched in
    pages of 10, reversed, cleaned of emoji and ``[...]`` codes, and inserted
    one by one, each attributed to a randomly chosen entry of ``list_majia``.

    Args:
        aid: article id used as both ``group_id`` and ``item_id`` in the
            comment API URL.
        article_id: not read by the visible code.
            NOTE(review): presumably consumed by the elided INSERT statement.

    Returns:
        ``True`` once all fetched comments have been processed.
    """
    global conn

    count_max = 10  # comments requested per page
    randCount = random.randint(8, 15)
    # Number of pages needed to cover randCount comments (ceil division).
    loop_count = max(1, -(-randCount // count_max))

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    }

    # Read the pages, merge them, then keep the requested number of comments.
    comments = []
    for loop_i in range(loop_count):
        offset_temp = loop_i * count_max
        source_url = 'https://www.toutiao.com/article/v2/tab_comments/?app_name=toutiao-web&group_id=%s&item_id=%s&count=%s&offset=%s' % (
            aid, aid, count_max, offset_temp)
        # A timeout keeps a stalled connection from blocking the run forever.
        resp = requests.get(source_url, headers=headers, timeout=10).json()
        page = resp.get('data') or []
        if not page:
            # No (more) comments available; further pages would be empty too.
            break
        comments.extend(page)
    comments = comments[:randCount]
    comments.reverse()

    # The cursor is closed even if an insert fails.
    cursor = conn.cursor()
    try:
        for comment in comments:
            text = comment['comment']['text']
            # Turn emoji into :name: codes, then drop those codes.
            text = emoji.demojize(text)
            text = re.sub(r':(.*?):', '', text).strip()
            # Drop short bracketed emoticon codes such as [xx].
            text = re.sub(r'\[.{,8}\]', '', text)

            time_stamp = int(time.time())
            # Pick a sock-puppet user; random.choice never indexes past the
            # end, unlike randint(0, len(...)) which is inclusive.
            majiaid = random.choice(list_majia)[0]
            sql_insert = "INSERT INTO `...."
            cursor.execute(sql_insert)
            # NOTE(review): presumably used by the elided "insert likes" step.
            lastRowId = cursor.lastrowid
            # Commit, otherwise the database is not updated.
            conn.commit()
    finally:
        cursor.close()
    return True

文章和评论就采集好了。以上是必要代码,仅供参考!

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值