Preface
A while ago I took on a project that scrapes Google Maps search results for given keywords and saves them, mainly shop names and phone numbers. Other fields are available too; to keep things simple I didn't record them, and the relevant parts of the source are left blank for anyone who wants to fill them in.
Environment Setup
- python3.11 (I created the environment directly with miniconda; there is no special version requirement)
- requirements.txt:

```
ttkbootstrap~=1.10.1
bs4~=0.0.1
beautifulsoup4~=4.12.2
selenium~=4.12.0
pyinstaller~=5.13.2
```
- After activating the environment, install the dependencies from the command line:

```
pip install -r requirements.txt
```

- Prepare a Chrome build bundled with a matching driver (my download resources include a complete Chrome+Driver package; unzip it straight into the source directory. I set it to 0 points to download, so it should be directly usable). See the layout sketch after this list.
- Prepare an icon image as the GUI's icon (main.py below loads it as ico_cuckoo.png)
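browser.py and main.py hard-code the Chrome paths, so the project directory presumably ends up laid out like this (a sketch; only chrome.exe, chromedriver.exe, the three .py files, and the icon name come from the code, the rest is illustrative):

```
GoogleMap/
├── Application/
│   ├── chrome.exe
│   └── chromedriver.exe
├── MyThread.py
├── browser.py
├── main.py
├── ico_cuckoo.png
└── output/          # created automatically by csv_writer
```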
Source Code (complete and clean: nothing removed, no backdoors)
1. MyThread.py (the multithreaded message queue)
"""
Creation date: 2023/9/1
Creation Time: 14:50
DIR PATH:
Project Name: LocalProject
FILE NAME: MyThread.py
Editor: cuckoo
"""
import queue
import threading
class WorkerThread(threading.Thread):
def __init__(self, task_queue):
super().__init__()
self.task_queue = task_queue
self.daemon = True
self.start()
def run(self):
while True:
func, args, kwargs = self.task_queue.get()
try:
func(*args, **kwargs)
except Exception as e:
print(f"Error executing function: {e}")
finally:
self.task_queue.task_done()
class ThreadedQueue:
def __init__(self):
self.task_queue = queue.Queue()
self.worker = WorkerThread(self.task_queue)
def add_task(self, func, *args, **kwargs):
self.task_queue.put((func, args, kwargs))
- This file defines two classes; the one called later is ThreadedQueue. Creating an instance of it starts a message queue running on its own thread, and the self.daemon attribute of WorkerThread controls whether that thread is a daemon thread. A usage sketch follows this list.
- Another version of mine takes this flag as a constructor parameter; in this version it is hard-coded, so adjust it yourself if needed. I will cover this multithreaded message queue in a separate article.
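For reference, a minimal usage sketch (slow_job is a made-up stand-in for any callable you want queued):

```python
from time import sleep

from MyThread import ThreadedQueue


def slow_job(name, seconds=1):
    sleep(seconds)
    print(f"{name} done")


tq = ThreadedQueue()                       # starts the worker thread
tq.add_task(slow_job, "job-1")             # returns immediately
tq.add_task(slow_job, "job-2", seconds=2)  # tasks run one after another
tq.task_queue.join()                       # block until both tasks finish
```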
2. browser.py (the browser, and the scraper itself)
"""
Creation date: 2023/9/14
Creation Time: 11:14
DIR PATH:
Project Name: GoogleMap
FILE NAME: browser.py
Editor: cuckoo
"""
import csv
import os
from subprocess import CREATE_NO_WINDOW
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
CHROME_PATH = "Application/chrome.exe"
CHROME_DRIVER_PATH = "Application/chromedriver.exe"
class WebScraper:
def __init__(self, port=None):
self.__url = "https://www.google.com/maps"
self.__port = port
self.__session = None
option = webdriver.ChromeOptions()
option.binary_location = CHROME_PATH
option.add_argument('--headless')
option.add_argument('----blink-settings=imagesEnabled=false')
option.add_argument('--disable-gpu')
option.add_argument('--disable-javascript')
option.add_argument('--disable-extensions')
option.add_argument('--disable-webgl')
option.add_argument('--disable-plugins')
option.add_argument('--no-sandbox')
option.add_argument('--disable-dev-shm-usage')
option.add_argument('--disable-features=VizDisplayCompositor')
option.add_argument('--disable-blink-features=AutomationControlled')
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_argument('--window-size=1366,768')
if self.__port:
print(f"正在使用代理端口 [{self.__port}]")
option.add_argument(f'--proxy-server=socks5://127.0.0.1:{self.__port}')
else:
print("未使用代理端口")
service = webdriver.ChromeService(CHROME_DRIVER_PATH)
service.creation_flags = CREATE_NO_WINDOW
self.__driver = webdriver.Chrome(options=option, service=service)
    def get_session(self, search_word, wx="", jx="", sf="10"):
        session = self.__start_session(search_word, wx, jx, sf)
        info_list = self.__extract_shop_info(session)
        if info_list:
            filename = search_word.replace(" ", "_") + f"_{wx}_{jx}_{sf}".replace(".", "-")
            self.csv_writer(filename, info_list)

    def __start_session(self, search_word, wx, jx, sf):
        # URL-encode the search term
        search_word = search_word.replace(" ", "+")
        if wx and jx:
            url = f"{self.__url}/search/{search_word}/@{wx},{jx},{sf}z?entry=ttu"
        else:
            url = f"{self.__url}/search/{search_word}"
        print(f"Loading page [{url}]")
        self.__driver.get(url)
        count = 1
        length = 0
        refresh = 0
        while True:
            try:
                # The Chinese-locale Google Maps marker for "no more results"
                if "没有其他结果了。" in self.__driver.page_source:
                    print("Loading finished")
                    break
                # Wait until result entries are present
                WebDriverWait(self.__driver, 2).until(EC.presence_of_element_located((By.CLASS_NAME, "qjESne")))
                # The scrollable results panel on the left of the page
                scroll_box = self.__driver.find_element(
                    By.XPATH, '/html/body/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]')
                self.__driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scroll_box)
                print("Loading the next page")
                length_temp = len(self.__extract_shop_info(self.__driver.page_source))
                if length == length_temp:
                    # No new results since the last scroll; refresh after several stalled rounds
                    if refresh > 5:
                        self.__driver.refresh()
                    else:
                        refresh += 1
                else:
                    length = length_temp
                    count += 1
            except Exception:
                if "没有其他结果了。" in self.__driver.page_source:
                    print("Loading finished")
                    break
            sleep(1)
        return self.__driver.page_source
    @staticmethod
    def __extract_shop_info(html_content: str) -> list:
        print("Extracting information")
        soup = BeautifulSoup(html_content, 'html.parser')
        # Each search result card lives in a div with class "Nv2PK"
        target_divs = soup.find_all('div', class_='Nv2PK')
        result = []
        for div in target_divs:
            info_block = div.find('div', class_='Z8fK3b')
            text_list = list(info_block.stripped_strings)
            if len(text_list) >= 2:
                name = text_list[0].strip()
                phone = text_list[-1].strip()
                # Blank out the phone field unless it consists solely of digits, '+' and spaces
                if not all(char.isdigit() or char in ['+', ' '] for char in phone):
                    phone = ""
                link = div.find('a', class_='hfpxzc')
                href = link.get('href') if link else ""
                result.append([name, phone, href])
        print(f"Extraction finished: {len(result)} records")
        return result
    @staticmethod
    def csv_writer(filename: str, data: list):
        # Create the output folder; it may already exist
        os.makedirs("output", exist_ok=True)
        print(f"Writing {filename}.csv")
        title = ['Shop Name', 'Phone', 'URL']
        with open(f'output/{filename}.csv', 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(title)
            writer.writerows(data)
        print("Write finished")

    def driver_del(self):
        try:
            self.__driver.quit()
        except Exception:
            pass

    def __del__(self):
        try:
            self.driver_del()
        except Exception:
            pass


if __name__ == '__main__':
    keyword_ = "重庆小面"
    scraper = WebScraper(port="")
    scraper.get_session(keyword_)
```
- This file defines the scraper's main class, WebScraper. The proxy port is passed in at construction time and is treated as a local socks5 port; if your proxy is HTTP instead, just replace socks5 with http in this line: option.add_argument(f'--proxy-server=socks5://127.0.0.1:{self.__port}')
- The code runs as follows:
  - Take the keyword (search_word), latitude (wx), longitude (jx), and search-range (zoom) level (sf) from the arguments and join them into the full request URL, e.g. .../maps/search/coffee/@29.56,106.55,10z?entry=ttu (illustrative coordinates).
  - Locate the results panel and keep scrolling it to the bottom, scrolling again after each new batch loads, until there are no more results.
  - Parse the page into the result list; the remaining elements of text_list hold the other fields of each result and can be adapted as needed (see the sketch after this list).
  - Write the results to a CSV file.
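If you do want those extra fields, here is a hedged sketch of one way to grab everything in each result card. extract_all_fields is my name, not part of the project, and the field order inside a card varies by listing and locale, so inspect the output before trusting fixed indices:

```python
from bs4 import BeautifulSoup


def extract_all_fields(html_content: str) -> list:
    """Variant of __extract_shop_info that keeps every text field of a
    result card instead of just the name and phone number."""
    soup = BeautifulSoup(html_content, 'html.parser')
    rows = []
    for div in soup.find_all('div', class_='Nv2PK'):
        info_block = div.find('div', class_='Z8fK3b')
        if info_block is None:
            continue
        link = div.find('a', class_='hfpxzc')
        href = link.get('href') if link else ""
        # Keep the whole field list (name, rating, address, phone, ...)
        rows.append([href] + list(info_block.stripped_strings))
    return rows
```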
3. main.py (the main file, and the GUI)
"""
Creation date: 2023/9/14
Creation Time: 11:14
DIR PATH:
Project Name: GoogleMap
FILE NAME: main.py
Editor: cuckoo
"""
import datetime
import sys
from ttkbootstrap import *
from ttkbootstrap.dialogs import Messagebox
from ttkbootstrap.scrolled import ScrolledText
from MyThread import ThreadedQueue
from browser import WebScraper
class TextRedirector:
def __init__(self, update_func):
self.update_func = update_func
self.tq = ThreadedQueue()
self.update_func("日志重定向完成,日志队列已启动")
def write(self, string):
self.tq.add_task(self.write_task, string)
def write_task(self, string):
string = str(string).strip()
self.update_func(string)
def flush(self):
pass
class SimpleGUI:
def __init__(self, root):
# 根窗口设置
self.root = root
self.root.bind("<Control-w>", lambda event: self.close_all())
self.browsers = []
# StringVar变量
self.input_var = tk.StringVar()
self.proxy_port_var = tk.StringVar()
self.jx_port_var = tk.StringVar()
self.wx_port_var = tk.StringVar()
self.sf_port_var = tk.StringVar(value="10")
# 上方的label
frame1 = Frame(root)
Label(frame1, text="搜索内容").grid(row=0, column=0, padx=10, pady=10)
Entry(frame1, textvariable=self.input_var, width=52).grid(row=0, column=1, padx=10, pady=10)
Label(frame1, text="代理端口号").grid(row=0, column=2, padx=10, pady=10)
Entry(frame1, textvariable=self.proxy_port_var, width=10).grid(row=0, column=3, padx=10, pady=10)
frame1.grid(row=0, column=0, padx=10, pady=10)
frame2 = Frame(root)
Label(frame2, text="经线").grid(row=0, column=0, padx=10, pady=10)
Entry(frame2, textvariable=self.jx_port_var, width=20).grid(row=0, column=1, padx=10, pady=10)
Label(frame2, text="纬线").grid(row=0, column=2, padx=10, pady=10)
Entry(frame2, textvariable=self.wx_port_var, width=20).grid(row=0, column=3, padx=10, pady=10)
Label(frame2, text="精度").grid(row=0, column=4, padx=10, pady=10)
Combobox(frame2, values=[str(i) for i in range(3, 21)], width=4, textvariable=self.sf_port_var,
state="readonly").grid(row=0, column=5, padx=10, pady=10)
Button(frame2, text="搜索", command=self.search_action, width=6).grid(row=0, column=6, padx=10, pady=10)
frame2.grid(row=1, column=0, padx=10, pady=10)
# 下方的日志窗口
frame3 = Frame(root)
_ = ScrolledText(frame3, width=90, height=10, state=tk.DISABLED,
wrap=WORD, autohide=True, bootstyle='round')
_.grid(row=0, column=0, padx=10, pady=10)
self.log_window = _.text
frame3.grid(row=2, column=0, padx=10, pady=10)
tr = TextRedirector(self.write_to_log)
sys.stdout = tr
sys.stderr = tr
def search_action(self):
# 获取输入框内容和代理端口号
content = self.input_var.get().strip()
if not content:
Messagebox.show_warning(title="警告", message="输入内容不能为空")
return
jx = self.jx_port_var.get().strip()
wx = self.wx_port_var.get().strip()
if not jx or not wx:
print("经纬度为空")
proxy_port = self.proxy_port_var.get().strip()
sf = self.sf_port_var.get().strip()
scraper = WebScraper(port=proxy_port if proxy_port else None)
tq = ThreadedQueue()
tq.add_task(scraper.get_session, content, wx, jx, sf)
self.browsers.append(scraper)
return
def write_to_log(self, message):
if not message.strip():
return
self.log_window.config(state=tk.NORMAL)
msg = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " " + message + "\n"
self.log_window.insert(tk.END, msg)
self.log_window.see(tk.END)
self.log_window.config(state=tk.DISABLED)
def close_all(self):
for browser in self.browsers:
browser: WebScraper
browser.driver_del()
self.root.destroy()
def __del__(self):
self.close_all()
if __name__ == "__main__":
nowdate = datetime.now().date()
enddate = datetime(2999, 9, 14).date()
if nowdate > enddate:
Messagebox.show_warning(title="警告", message="软件已过期,请联系作者")
exit(0)
_root = Window(
title=f"GoogleMap-Searcher",
iconphoto="ico_cuckoo.png",
themename="superhero",
resizable=(False, False)
)
app = SimpleGUI(_root)
_root.mainloop()
- This file first defines the log-redirection class, which routes the program's output into the GUI's log text box.
- The code runs as follows:
  - draw the GUI
  - hand the user's input to the scraper class defined earlier
- There isn't much else to say; I'd like to think the code speaks for itself QAQ
- ttkbootstrap ships many other GUI styles; just change the value passed to themename, as sketched below.
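A minimal sketch for listing the bundled theme names, based on my understanding of the ttkbootstrap Style API:

```python
from ttkbootstrap import Style

# Style() attaches to (or creates) a default root window;
# theme_names() returns every bundled theme, including "superhero".
style = Style()
print(style.theme_names())
```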