comment_info
爬取的东西是直接往数据库存的,数据库基本操作建表
CREATE TABLE comment_info_update
(
comment_url VARCHAR(200),
comment_title VARCHAR(200),
comment_ LONGTEXT,
comment_score VARCHAR(50),
comment_date VARCHAR(100),
user_name VARCHAR(100),
uid VARCHAR(100)
)
下面是爬虫代码
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup  # BeautifulSoup: extracts data from HTML/XML documents
import random
import time
import pymysql
from time import sleep
# Browser-like request headers used for every crawl request.
# Cookie is intentionally left blank; fill in a valid session cookie if the
# target site requires login.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/201002201 Firefox/55.0',
    # Fixed: the value was garbled as 'textml,...' — servers expect the
    # valid media type 'text/html' here.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cookie': '',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    # 'Cache-Control': 'no-cache '
}
# Connect to the local MySQL database that stores the scraped data.
# NOTE: PyMySQL >= 1.0 removed positional connect() arguments, so keyword
# arguments are required for the call to keep working.
db = pymysql.connect(host="localhost", user="root",
                     password="root", database="travel_db")
# Shared cursor used by every query in this script.
cursor = db.cursor()
# Load the proxy IP pool once at startup; fetchall() returns 1-tuples,
# presumably like ('1.2.3.4:80',) — confirm against the ip_list schema.
ip = "SELECT ip FROM ip_list;"
cursor.execute(ip)
results2 = cursor.fetchall()
def get_random_ip(ip_rows=None):
    """Pick a random HTTP proxy from the IP pool.

    Args:
        ip_rows: optional iterable of 1-tuple rows ``(ip,)`` as returned by
            ``cursor.fetchall()``. Defaults to the module-level ``results2``
            pool loaded at startup.

    Returns:
        A requests-style proxies dict, e.g. ``{'http': 'http://1.2.3.4:80'}``.
    """
    if ip_rows is None:
        ip_rows = results2
    # Bug fix: fetchall() yields tuples, so the old "'http://' + str(ip)"
    # produced an invalid URL like "http://('1.2.3.4',)". Use the first
    # column of each row instead.
    proxy_list = ['http://' + str(row[0]) for row in ip_rows]
    proxy_ip = random.choice(proxy_list)
    return {'http': proxy_ip}
# Select every sight URL that has not been crawled yet (flag = '0').
sql0 = "SELECT Url FROM sights_1 where flag='0';"
# Collector for scraped records (not filled in this visible chunk).
datalst = []
# Execute the SQL statement.
cursor.execute(sql0)
# Fetch the full list of pending URL rows (1-tuples).
results = cursor.fetchall()
# Running counter used by the crawl loop below.
i = 0
# Main crawl loop over every pending sight URL. NOTE(review): this loop
# continues past the end of the visible chunk; the body below is annotated
# as-is, code unchanged.
for row in results:
    url = row[0]
    value = url
    # Per-page accumulator for scraped fields (filled later in the loop).
    dic = {
    }
    # NOTE(review): a random proxy is selected here but never passed to
    # requests.get() below (no proxies=/headers= argument) — confirm
    # whether that was intended.
    proxies = get_random_ip()
    try:
        ri = requests.get(url)
        # Mark this URL as claimed (flag = -1) before parsing.
        # NOTE(review): string-formatted SQL is injection-prone; prefer
        # cursor.execute("UPDATE sights_1 SET flag=-1 where Url=%s", (value,)).
        sql1 = "UPDATE sights_1 SET flag=-1 where Url='%s'" % (value)
        cursor.execute(sql1)
        db.commit()
        # Random 2-10 second pause to throttle the crawl.
        sleeptime = random.randint(2, 10)
        time.sleep(sleeptime)
        soupi = BeautifulSoup(ri.text, 'lxml')
        # Comment-count element; presumably a text like "1234条" whose
        # digits are extracted below — verify the CSS class on the live site.
        comments = soupi.find('span', attrs={
            'class': 'e_nav_comet_num'}).text
        comments = re.findall(r"\d+", comments)
        for m in comments:
            j = int(m, base=10)
            print