环境准备 已有环境可以忽略
python3 和 pip3 安装
配置 geckodriver 驱动的环境变量,或者把驱动放到系统 PATH 已包含的目录中(类似 Windows 下 cmd 所在的目录)。
Windows:C:\Windows\System32;Linux:/usr/bin 或 /usr/local/bin
代码
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 24 12:07:25 2019
@author: icheng
"""
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from pyquery import PyQuery as pq
import time
import pymysql
import threading
class Spider:
    """Headless-Firefox scraper that collects course enrollment counts.

    One Spider owns one browser instance. The browser is started lazily by
    ``getBatchData`` (via the private ``__init`` method) so that constructing
    a Spider object stays cheap — ``run`` builds one Spider per worker thread
    before any of them start.
    """

    # NOTE: deliberately named __init (not __init__): it is invoked
    # explicitly from getBatchData so the browser only launches inside
    # the worker thread, not at construction time.
    def __init(self):
        self.__data = []                       # accumulated [count, id] rows
        self.__options = Options()
        self.__options.add_argument('-headless')
        self.__driver = Firefox(options=self.__options)
        print('init')

    def __close(self):
        """Shut down the headless browser."""
        self.__driver.quit()
        print("close")

    def pageParsing(self, url):
        """Load *url* in the browser (JavaScript executed) and return the
        rendered page source, or ``''`` if every retry fails.

        On ConnectionError, retries with exponential backoff (1, 2, 4, ...
        seconds) and gives up once the delay would exceed 50 seconds.
        """
        try:
            self.__driver.get(url)
            time.sleep(1)                      # give the page's JS time to run
            return self.__driver.page_source
        except ConnectionError:
            print('ConnectionError')
            print('Attempting to reconnect')
            # BUGFIX: the original started the backoff at 0 seconds, so
            # doubling it never grew past 0 and the loop spun forever.
            delay = 1
            while delay <= 50:
                time.sleep(delay)
                try:
                    self.__driver.get(url)
                    return self.__driver.page_source
                except ConnectionError:
                    delay *= 2
            # BUGFIX: the original fell through to `return html` with `html`
            # possibly unbound (UnboundLocalError). Return '' so the caller's
            # empty-text retry logic handles the failure gracefully.
            print('pageParsing wrong exit')
            return ''

    def getInfo(self, url):
        """Return the enrollment count scraped from *url*, or -1 on failure.

        Makes up to 6 fetch attempts (the selector sometimes renders late).
        """
        text = ''
        for _ in range(6):
            html = self.pageParsing(url)
            if html:
                doc = pq(html)
                text = doc('.course-enroll-info_course-enroll_price-enroll_enroll-count').text()
            # BUGFIX: the original tested `trynumber == 5` after the loop,
            # which threw away a successful fetch made on the last attempt.
            if text:
                break
        if text == '':
            print('geting', url.split('/')[-1])
            return -1
        # Extract the digits from the string '已有###人参加'
        # ("### people already enrolled").
        return int(text[2:-3])

    def getData(self):
        """Return the accumulated [count, id] rows."""
        return self.__data

    def getBatchData(self, urls):
        """Scrape every ``[url, id]`` pair in *urls*, appending ``[count, id]``
        to the internal data list. Failed pages (count == -1) are skipped.
        """
        self.__init()
        for url, row_id in urls:
            number = self.getInfo(url)
            if number == -1:
                continue
            self.__data.append([number, row_id])
        self.__close()
# 主进程 分成多个进程来爬取 加快速度
def run(urls):
    """Scrape all *urls* concurrently and return the combined results.

    *urls* is a list of ``[url, id]`` pairs; the return value is a list of
    ``[count, id]`` pairs suitable for ``cursor.executemany``. Each worker is
    a thread (not a process — the work is network-bound, so the GIL is
    released during the blocking I/O) driving its own headless browser.
    """
    threadnumber = 10
    num = len(urls)
    count = num // threadnumber
    threads = []   # worker threads
    spiders = []   # one Spider (one browser) per thread

    def _spawn(chunk):
        # Create a Spider + thread pair for one slice of the URL list.
        spider = Spider()
        spiders.append(spider)
        threads.append(threading.Thread(target=spider.getBatchData, args=(chunk,)))

    for i in range(threadnumber):
        chunk = urls[count * i:count * (i + 1)]
        # BUGFIX: when num < threadnumber, count == 0 and every slice is
        # empty — the original still spawned 10 threads, each launching and
        # quitting a headless browser for nothing. Skip empty slices.
        if chunk:
            _spawn(chunk)
    remainder = urls[count * threadnumber:num]
    if remainder:
        _spawn(remainder)

    for t in threads:          # start all workers
        t.start()
    for t in threads:          # block until every worker finishes
        t.join()

    data = []
    for s in spiders:
        data += s.getData()
    return data
def update():
    """Fetch (id, url) rows from the database, scrape each page's enrollment
    count, and write the counts back. Prints total elapsed seconds.
    """
    times = time.time()
    # Connect to the database — change these settings to match the target
    # machine before running.
    db = pymysql.connect(host='localhost', user='root', password='123456', database='world')
    try:
        cursor = db.cursor()
        # Adjust this SELECT as needed; only id and url are required.
        select = 'select id,url from c'
        cursor.execute(select)
        # run() expects [url, id] pairs.
        urls = [[row[1], row[0]] for row in cursor]
        print('running...')
        data = run(urls)
        # BUGFIX: the original reassigned `data = []` here, wiping every
        # scraped result so executemany updated zero rows.
        print('Data Acquisition Success')
        # Adjust this UPDATE statement as needed.
        sql = 'update c set number=%s where id=%s'
        cursor.executemany(sql, data)
        db.commit()
        cursor.close()
        print('Successful database update')
    finally:
        # Always release the connection, even if scraping or SQL fails.
        db.close()
    times2 = time.time()
    print(times2 - times)
# Script entry point: run the scrape-and-update pipeline once.
if __name__ == '__main__':
    update()