# Crawl a website using multithreading and logging (使用多线程和日志爬取任意网站)
import threading
from queue import Queue
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import logging
# Crawler threads registered by create_crawl_thread(); started and joined in main().
g_crawl_list = []
# Parser threads registered by create_parse_thread(); started and joined in main().
g_parse_list = []
class CrawlThread(threading.Thread):
    """Crawler thread #1.

    Logs into t.people.com.cn, scrolls each feed page to force lazy
    content to load, expands every comment, and pushes the resulting
    page source onto the shared queue for ParseThread to consume.
    """

    def __init__(self, name, data_queue):
        super(CrawlThread, self).__init__()
        self.name = name
        # Thread-safe queue.Queue shared with the parser thread.
        self.data_queue = data_queue

    def run(self):
        # Configure the logger ONCE. The original re-created and re-added a
        # FileHandler on every page iteration, so each log record was
        # duplicated once per iteration already completed.
        logger = logging.getLogger(__name__)
        logger.setLevel(level=logging.DEBUG)
        handler = logging.FileHandler("采集1号.log")
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s,%(name)s,%(levelname)s,%(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.info("Start print log")

        driver = webdriver.Firefox()
        try:
            # NOTE(review): credentials are hard-coded — move them to
            # configuration or environment variables.
            driver.get('http://t.people.com.cn/login.action')
            wait = WebDriverWait(driver, 10)
            username = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#userName'))
            )
            username.send_keys('17332335684')
            password = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#password'))
            )
            password.send_keys('zgx675050748')
            time.sleep(4)
            password.send_keys(Keys.ENTER)

            for _ in range(5):
                # Scroll to the bottom three times so lazily-loaded feed
                # items are rendered before we harvest the page.
                for _ in range(3):
                    time.sleep(3)
                    driver.execute_script('window.scrollTo(0,1000000)')
                time.sleep(3)
                logger.debug("Do something")
                # Selenium 4 removed find_elements_by_xpath; use the
                # By-based API instead.
                comments = driver.find_elements(
                    By.XPATH, "//a[@data-nodetype='btn_comment']"
                )
                for comment in comments:
                    comment.click()
                # queue.Queue is already thread-safe; the per-iteration
                # threading.Lock() in the original protected nothing
                # (a brand-new lock each pass) and was removed.
                self.data_queue.put(driver.page_source)
                # Renamed from `next` to avoid shadowing the builtin.
                next_btn = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '.wbp_pagelist_nextbtn'))
                )
                logger.warning("Something maybe fail.")
                next_btn.click()
                time.sleep(3)
            logger.info("Finish")
        finally:
            # Always release the browser, even if a wait times out.
            driver.quit()
class CrawlThread2(threading.Thread):
    """Crawler thread #2.

    Same flow as CrawlThread, except that after login it first scrolls
    the landing feed and clicks the fifth tab link under #feedListPage
    before paginating and harvesting page sources into the shared queue.
    """

    def __init__(self, name, data_queue):
        super(CrawlThread2, self).__init__()
        self.name = name
        # Thread-safe queue.Queue shared with the parser thread.
        self.data_queue = data_queue

    def run(self):
        # Configure the logger ONCE (the original re-added a FileHandler
        # every page iteration, duplicating log records).
        logger = logging.getLogger(__name__)
        logger.setLevel(level=logging.DEBUG)
        handler = logging.FileHandler("采集2号.log")
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s,%(name)s,%(levelname)s,%(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.info("Start print log")

        driver = webdriver.Firefox()
        try:
            # NOTE(review): credentials are hard-coded — move them to
            # configuration or environment variables.
            driver.get('http://t.people.com.cn/login.action')
            wait = WebDriverWait(driver, 10)
            username = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#userName'))
            )
            username.send_keys('17332335684')
            password = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#password'))
            )
            password.send_keys('zgx675050748')
            time.sleep(4)
            password.send_keys(Keys.ENTER)

            # Scroll the landing feed, then switch to the fifth tab link.
            for _ in range(3):
                time.sleep(3)
                driver.execute_script('window.scrollTo(0,1000000)')
            time.sleep(3)
            # Selenium 4 removed find_element_by_xpath; use the By API.
            tab_link = driver.find_element(
                By.XPATH, "//*[@id='feedListPage']/div/a[5]"
            )
            tab_link.click()

            for _ in range(5):
                # Scroll to the bottom three times so lazily-loaded feed
                # items are rendered before we harvest the page.
                for _ in range(3):
                    time.sleep(3)
                    driver.execute_script('window.scrollTo(0,1000000)')
                time.sleep(3)
                logger.debug("Do something")
                comments = driver.find_elements(
                    By.XPATH, "//a[@data-nodetype='btn_comment']"
                )
                for comment in comments:
                    comment.click()
                # queue.Queue is already thread-safe; the per-iteration
                # Lock in the original protected nothing and was removed.
                self.data_queue.put(driver.page_source)
                # Renamed from `next` to avoid shadowing the builtin.
                next_btn = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '.wbp_pagelist_nextbtn'))
                )
                logger.warning("Something maybe fail.")
                next_btn.click()
                time.sleep(3)
            logger.info("Finish")
        finally:
            # Always release the browser, even if a wait times out.
            driver.quit()
class ParseThread(threading.Thread):
    """Parser thread.

    Drains exactly 10 page sources from the shared queue and appends
    each to a numbered HTML file (0.html .. 9.html). Blocks on
    data_queue.get() until each item arrives.
    """

    def __init__(self, name, data_queue):
        super(ParseThread, self).__init__()
        self.name = name
        # Thread-safe queue.Queue fed by the crawler threads.
        self.data_queue = data_queue

    def run(self):
        # Configure the logger once, before any work, so failures are logged.
        logger = logging.getLogger(__name__)
        logger.setLevel(level=logging.DEBUG)
        handler = logging.FileHandler("解析1号.log")
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s,%(name)s,%(levelname)s,%(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.info("Start print log")

        for a in range(0, 10):
            # Context manager guarantees the file is closed even if the
            # write raises (the original leaked the handle on error).
            with open(str(a) + '.html', 'a', encoding='utf-8') as f:
                f.write(self.data_queue.get())
            logger.debug("Do something")
        logger.warning("Something maybe fail.")
        logger.info("Finish")
def create_crawl_thread(data_queue):
    """Instantiate both crawler threads and register them in g_crawl_list.

    The threads are created but not started; main() starts and joins them.
    """
    crawler_specs = (
        (CrawlThread, '采集1号'),
        (CrawlThread2, '采集2号'),
    )
    for thread_cls, thread_name in crawler_specs:
        g_crawl_list.append(thread_cls(thread_name, data_queue))
def create_parse_thread(data_queue):
    """Instantiate the parser thread(s) and register them in g_parse_list.

    The threads are created but not started; main() starts and joins them.
    """
    for thread_name in ('解析1号',):
        g_parse_list.append(ParseThread(thread_name, data_queue))
def create_queue():
    """Return a fresh thread-safe FIFO queue shared by crawlers and parser."""
    return Queue()
def main():
    """Wire up the shared queue and all threads, run them to completion."""
    data_queue = create_queue()
    create_crawl_thread(data_queue)
    create_parse_thread(data_queue)
    # Crawlers first, then the parser — same start/join order as before.
    workers = g_crawl_list + g_parse_list
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    print('所有线程全部结束')
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()