Scraping Qunar comment details with a Python crawler

comment_info

The scraped data goes straight into a MySQL database, so start with the basic database step of creating the table:

CREATE TABLE comment_info_update
(
comment_url VARCHAR(200),
comment_title VARCHAR(200),
comment_content LONGTEXT,  -- column name truncated in the source; assuming comment_content
comment_score VARCHAR(50),
comment_date VARCHAR(100),
user_name VARCHAR(100),
uid VARCHAR(100)
);
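The script further below also assumes two helper tables the post never shows: ip_list, a pool of proxy addresses, and sights_1, the queue of attraction URLs with a flag column tracking crawl progress. Minimal definitions inferred from the queries in the code (a sketch; the real schemas may differ):

CREATE TABLE ip_list
(
ip VARCHAR(100)     -- proxy address, e.g. 1.2.3.4:8080
);

CREATE TABLE sights_1
(
Url VARCHAR(200),
flag VARCHAR(10)    -- '0' = pending, -1 = in progress (the code compares both ways)
);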

Below is the crawler code:

import pandas as pd
import requests
import re
from bs4 import BeautifulSoup  # used to pull data out of HTML and XML documents
import random
import time
import pymysql

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cookie': '',  # paste your own cookie here
    'Connection': 'keep-alive',
    'Pragma': 'no-cache'
    # 'Cache-Control': 'no-cache'
}


# Connect to MySQL (recent PyMySQL versions require keyword arguments)
db = pymysql.connect(host="localhost", user="root", password="root", database="travel_db")
# Use cursor() to get an operation cursor
cursor = db.cursor()
# SQL query: load the proxy IP pool
ip = "SELECT ip FROM ip_list;"
cursor.execute(ip)
results2 = cursor.fetchall()  # a tuple of 1-tuples, e.g. (('1.2.3.4:8080',), ...)


def get_random_ip():
    # Build a requests-style proxies mapping from a random entry in the pool.
    ip_list = results2
    proxy_list = []
    for ip in ip_list:
        # each row from fetchall() is a 1-tuple, so take its first field
        proxy_list.append('http://' + str(ip[0]))
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
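A returned mapping looks like {'http': 'http://1.2.3.4:8080'} and plugs straight into the proxies argument of requests. A quick sanity check (the target here is httpbin, not Qunar):

proxies = get_random_ip()
print(proxies)  # e.g. {'http': 'http://1.2.3.4:8080'}
r = requests.get('http://httpbin.org/ip', headers=headers, proxies=proxies, timeout=10)
print(r.text)   # should report the proxy's address, not your own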


sql0 = "SELECT Url FROM sights_1 where flag='0';"
datalst = []
# Execute the SQL statement
cursor.execute(sql0)
# Fetch the full list of pending URLs
results = cursor.fetchall()
i = 0
for row in results:
    url = row[0]
    value = url
    dic = {}
    proxies = get_random_ip()

    try:
        # route the request through the random proxy and send the headers above;
        # the original fetched proxies but never passed them, so wire them in here
        ri = requests.get(url, headers=headers, proxies=proxies, timeout=10)

        # mark this URL as in progress so it is not picked up again
        sql1 = "UPDATE sights_1 SET flag=-1 where Url=%s"
        cursor.execute(sql1, (value,))
        db.commit()

        # pause 2-10 seconds between requests to avoid getting blocked
        sleeptime = random.randint(2, 10)
        time.sleep(sleeptime)

        soupi = BeautifulSoup(ri.text, 'lxml')
        # total comment count shown in the page's nav bar
        comments = soupi.find('span', attrs={'class': 'e_nav_comet_num'}).text
        comments = re.findall(r"\d+", comments)
        j = 0
        for m in comments:
            j = int(m, base=10)  # keep the last number found as the comment count
        print(url, j)
    except Exception as e:
        # minimal handler so the try block is complete; log the failure and move on
        print('failed:', url, e)
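The original post is cut off at this point. To show where the loop was heading, here is a minimal sketch of the missing half: walking an attraction's paginated comment list and inserting each comment into comment_info_update. Everything Qunar-specific in it is an assumption made for illustration; the ?page= query parameter, the 10-comments-per-page figure, and every CSS class name are placeholders, not the site's confirmed markup.

def scrape_comments(sight_url, total_comments):
    # Fetch every comment page for one sight and store the rows.
    # Relies on the module-level cursor, db, headers and get_random_ip() above.
    pages = total_comments // 10 + 1  # assuming 10 comments per page
    for page in range(1, pages + 1):
        page_url = '%s?page=%d' % (sight_url, page)  # hypothetical pagination scheme
        r = requests.get(page_url, headers=headers, proxies=get_random_ip(), timeout=10)
        soup = BeautifulSoup(r.text, 'lxml')
        # every class name below is a placeholder; inspect the live page for the real ones
        for item in soup.find_all('div', attrs={'class': 'comment_item'}):
            row = (
                page_url,
                item.find('a', attrs={'class': 'comment_title'}).text.strip(),
                item.find('div', attrs={'class': 'comment_content'}).text.strip(),
                item.find('span', attrs={'class': 'comment_score'}).text.strip(),
                item.find('span', attrs={'class': 'comment_date'}).text.strip(),
                item.find('span', attrs={'class': 'user_name'}).text.strip(),
                item.find('span', attrs={'class': 'user_id'}).text.strip(),
            )
            cursor.execute(
                "INSERT INTO comment_info_update "
                "(comment_url, comment_title, comment_content, comment_score, "
                "comment_date, user_name, uid) VALUES (%s, %s, %s, %s, %s, %s, %s)",
                row)
        db.commit()
        time.sleep(random.randint(2, 10))
    # mark the sight as finished so the outer loop never revisits it
    cursor.execute("UPDATE sights_1 SET flag=1 WHERE Url=%s", (sight_url,))
    db.commit()

Inside the try block above, the call would then be scrape_comments(url, j), followed by db.close() once the outer loop finishes.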