1、 概述
本博客纯属原创,如有转载,请注明作者
运行环境:python3.5
所需模块:bs4、queue、threading、pymysql、requests。大家如果想运行此代码,只需要将我标粗的部分(请求头 headers 和数据库连接参数)修改即可。
2、具体内容
2.1 导入具体模块
###导入具体模块
import requests
from bs4 import BeautifulSoup
import re
from collections import deque
import sys
import numpy as np
import jieba
import threading
from threading import current_thread,Lock
from time import ctime ,sleep
import pymysql
import json
import urllib
import math
import queue
2.2 定义线程类
class MyThread(threading.Thread):
    """Thread that calls ``funcs(*args)`` when it is started.

    Args:
        funcs: Callable executed inside the new thread.
        args: Sequence of positional arguments unpacked into ``funcs``.
        name: Optional thread name. When empty, the default name generated
            by ``threading.Thread`` is kept instead of being replaced by ''.
    """

    def __init__(self, funcs, args, name=''):
        # Hand the name to the base constructor; ``None`` asks Thread to
        # generate its usual "Thread-N" style name.
        super().__init__(name=name or None)
        self.funcs = funcs
        self.args = args

    def run(self):
        """Invoke the stored callable with the stored arguments."""
        self.funcs(*self.args)
2.3 接下来就是重点了
###接下来就是爬取网页了
# Request headers sent with every review-page fetch (desktop Edge browser).
# EDIT ME: adjust to your own browser if the site rejects these.
_HEADERS = {
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-Hans-CN, zh-Hans;q=0.5',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Connection': 'Keep-Alive',
}

# EDIT ME: MySQL connection settings; the password is a placeholder.
# Keyword arguments are used because newer PyMySQL releases no longer accept
# positional connection arguments.
_DB_CONFIG = {
    'host': 'localhost',
    'user': 'root',
    'password': '**********',
    'database': 'test',
    'charset': 'utf8',
}

# Strips every HTML tag from the prettified comment markup.
_TAG_RE = re.compile("<[^>]*>")

# Star-rating CSS class -> binary label stored in the ``scores`` column
# (3-5 stars -> 1, 0-2 stars -> 0).
_STAR_LABELS = {
    'starsa5': 1, 'starsa4': 1, 'starsa3': 1,
    'starsa2': 0, 'starsa1': 0, 'starsa0': 0,
}


def _parse_comments(html):
    """Return a list of ``(comment_text, label)`` rows found in *html*."""
    rows = []
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', class_='i-item'):
        body = item.find('dd')
        star = item.find('span', class_=re.compile(r'^sta'))
        if body is None or star is None:
            # Incomplete review markup: skip this item, keep the page.
            continue
        label = next(
            (_STAR_LABELS[cls] for cls in (star.get('class') or [])
             if cls in _STAR_LABELS),
            None,
        )
        if label is None:
            # Unknown rating class: do not reuse a label from another item.
            continue
        rows.append((_TAG_RE.sub('', body.prettify()), label))
    return rows


def _save_comments(rows):
    """Insert ``(comment_text, label)`` rows into the ``newjd`` table."""
    if not rows:
        return
    db = pymysql.connect(**_DB_CONFIG)
    try:
        with db.cursor() as cursor:
            # Parameterized query: the driver escapes quotes in comments.
            cursor.executemany(
                "INSERT INTO newjd (comment,scores) VALUES (%s,%s)", rows)
        db.commit()
    finally:
        db.close()


def getContent(que, delay=3):
    """Worker loop: fetch review pages from *que* and store their comments.

    Each URL taken from the left end of *que* is downloaded, decoded as GBK,
    parsed for review items, printed, and inserted into MySQL. A failure on
    one page is reported and the worker moves on to the next URL.

    Args:
        que: ``collections.deque`` of review-page URLs, shared between the
            worker threads.
        delay: Seconds to sleep after each successfully processed page.
    """
    while que:
        try:
            url = que.popleft()
        except IndexError:
            # Another worker took the last URL between the check and the pop.
            return
        print('正在爬的线程是'+current_thread().name+"爬的是"+url)
        try:
            response = requests.get(url, headers=_HEADERS, timeout=30)
            response.encoding = 'gbk'
            rows = _parse_comments(response.text)
            for content, _label in rows:
                print(content)
            _save_comments(rows)
            sleep(delay)
        except Exception as exc:
            # Top-level boundary of the worker: report and keep crawling.
            print('运行出错', url, repr(exc))
2.4 运行
###在这里我用了四个线程
def main(product_ids=(549056,), thread_count=4, comments_per_page=30):
    """Queue every review page of the given products and crawl them.

    For each product id the comment summary service is queried for
    ``CommentCount``; one review-page URL is queued per page of comments, and
    ``thread_count`` worker threads running ``getContent`` drain the queue.

    Args:
        product_ids: Iterable of JD product (SKU) ids to crawl.
        thread_count: Number of worker threads to start.
        comments_per_page: Page size used to derive the number of pages from
            the comment count.
    """
    # ``import urllib`` alone does not guarantee the ``request`` submodule
    # is loaded, so import it explicitly here.
    import urllib.request

    que = deque()
    for product_id in product_ids:
        summary_url = (
            'http://club.jd.com/ProductPageService.aspx'
            '?method=GetCommentSummaryBySkuId&referenceId=' + str(product_id)
        )
        with urllib.request.urlopen(summary_url, timeout=30) as response:
            summary = json.loads(response.read().decode('utf-8'))
        comment_count = summary.get('CommentCount') or 0
        # Round up AFTER dividing so the last, partially filled page is kept.
        page_count = math.ceil(comment_count / comments_per_page)
        for page in range(page_count):
            que.append('http://club.jd.com/review/' + str(product_id)
                       + '-0-' + str(page) + '-0.html')

    workers = [
        MyThread(getContent, (que,), name='thread' + str(index))
        for index in range(thread_count)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()