环境准备 已有环境可以忽略
python3 和 pip3 安装
配置 geckodriver 驱动的环境变量,或者把驱动放到系统 PATH 已包含的目录中(类似 Windows 下 cmd 所在的目录)。
Windows:C:\Windows\System32;Linux:/usr/bin 或 /usr/local/bin
代码
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 24 12:07:25 2019
@author: icheng
"""
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from pyquery import PyQuery as pq
import time
import pymysql
import threading
class Spider:
    """Headless-Firefox scraper that collects course enrollment counts.

    One Spider owns one browser instance. The browser is started lazily by
    ``getBatchData`` (via the private ``__init`` method) so that constructing
    a Spider object stays cheap — ``run`` builds one Spider per worker thread
    before any of them start.
    """

    # NOTE: deliberately named __init (not __init__): it is invoked
    # explicitly from getBatchData so the browser only launches inside
    # the worker thread, not at construction time.
    def __init(self):
        self.__data = []                       # accumulated [count, id] rows
        self.__options = Options()
        self.__options.add_argument('-headless')
        self.__driver = Firefox(options=self.__options)
        print('init')

    def __close(self):
        """Shut down the headless browser."""
        self.__driver.quit()
        print("close")

    def pageParsing(self, url):
        """Load *url* in the browser (JavaScript executed) and return the
        rendered page source, or ``''`` if every retry fails.

        On ConnectionError, retries with exponential backoff (1, 2, 4, ...
        seconds) and gives up once the delay would exceed 50 seconds.
        """
        try:
            self.__driver.get(url)
            time.sleep(1)                      # give the page's JS time to run
            return self.__driver.page_source
        except ConnectionError:
            print('ConnectionError')
            print('Attempting to reconnect')
            # BUGFIX: the original started the backoff at 0 seconds, so
            # doubling it never grew past 0 and the loop spun forever.
            delay = 1
            while delay <= 50:
                time.sleep(delay)
                try:
                    self.__driver.get(url)
                    return self.__driver.page_source
                except ConnectionError:
                    delay *= 2
            # BUGFIX: the original fell through to `return html` with `html`
            # possibly unbound (UnboundLocalError). Return '' so the caller's
            # empty-text retry logic handles the failure gracefully.
            print('pageParsing wrong exit')
            return ''

    def getInfo(self, url):
        """Return the enrollment count scraped from *url*, or -1 on failure.

        Makes up to 6 fetch attempts (the selector sometimes renders late).
        """
        text = ''
        for _ in range(6):
            html = self.pageParsing(url)
            if html:
                doc = pq(html)
                text = doc('.course-enroll-info_course-enroll_price-enroll_enroll-count').text()
            # BUGFIX: the original tested `trynumber == 5` after the loop,
            # which threw away a successful fetch made on the last attempt.
            if text:
                break
        if text == '':
            print('geting', url.split('/')[-1])
            return -1
        # Extract the digits from the string '已有###人参加'
        # ("### people already enrolled").
        return int(text[2:-3])

    def getData(self):
        """Return the accumulated [count, id] rows."""
        return self.__data

    def getBatchData(self, urls):
        """Scrape every ``[url, id]`` pair in *urls*, appending ``[count, id]``
        to the internal data list. Failed pages (count == -1) are skipped.
        """
        self.__init()
        for url, row_id in urls:
            number = self.getInfo(url)
            if number == -1:
                continue
            self.__data.append([number, row_id])
        self.__close()
# 主进程 分成多个进程来爬取 加快速度
def run(urls):
    """Scrape all *urls* concurrently and return the combined results.

    *urls* is a list of ``[url, id]`` pairs; the return value is a list of
    ``[count, id]`` pairs suitable for ``cursor.executemany``. Each worker is
    a thread (not a process — the work is network-bound, so the GIL is
    released during the blocking I/O) driving its own headless browser.
    """
    threadnumber = 10
    num = len(urls)
    count = num // threadnumber
    threads = []   # worker threads
    spiders = []   # one Spider (one browser) per thread

    def _spawn(chunk):
        # Create a Spider + thread pair for one slice of the URL list.
        spider = Spider()
        spiders.append(spider)
        threads.append(threading.Thread(target=spider.getBatchData, args=(chunk,)))

    for i in range(threadnumber):
        chunk = urls[count * i:count * (i + 1)]
        # BUGFIX: when num < threadnumber, count == 0 and every slice is
        # empty — the original still spawned 10 threads, each launching and
        # quitting a headless browser for nothing. Skip empty slices.
        if chunk:
            _spawn(chunk)
    remainder = urls[count * threadnumber:num]
    if remainder:
        _spawn(remainder)

    for t in threads:          # start all workers
        t.start()
    for t in threads:          # block until every worker finishes
        t.join()

    data = []
    for s in spiders:
        data += s.getData()
    return data
def update():
    """Fetch (id, url) rows from the database, scrape each page's enrollment
    count, and write the counts back. Prints total elapsed seconds.
    """
    times = time.time()
    # Connect to the database — change these settings to match the target
    # machine before running.
    db = pymysql.connect(host='localhost', user='root', password='123456', database='world')
    try:
        cursor = db.cursor()
        # Adjust this SELECT as needed; only id and url are required.
        select = 'select id,url from c'
        cursor.execute(select)
        # run() expects [url, id] pairs.
        urls = [[row[1], row[0]] for row in cursor]
        print('running...')
        data = run(urls)
        # BUGFIX: the original reassigned `data = []` here, wiping every
        # scraped result so executemany updated zero rows.
        print('Data Acquisition Success')
        # Adjust this UPDATE statement as needed.
        sql = 'update c set number=%s where id=%s'
        cursor.executemany(sql, data)
        db.commit()
        cursor.close()
        print('Successful database update')
    finally:
        # Always release the connection, even if scraping or SQL fails.
        db.close()
    times2 = time.time()
    print(times2 - times)
# Script entry point: run the scrape-and-update pipeline once.
if __name__ == '__main__':
    update()