下载并安装谷歌浏览器(Chrome)以及与其版本匹配的谷歌驱动(ChromeDriver)
1. 创建一个文本文件 url2.txt,每一行存放一个网页链接(程序会对这些网页逐个进行关键词匹配,并把结果追加输出到 err.txt 文件内)
2. 使用 Python(selenium 库)控制谷歌浏览器,代码如下:
from selenium import webdriver
import time
import re
class OpenBrowser(object):
    """Drive a Chrome browser through selenium and scan pages for keywords.

    Instance attributes:
        all_time: running count of hits (one per keyword found per page).
        keywords: literal strings searched for in each page's HTML.
        layer: query depth passed to the constructor (stored only).
        driver: the selenium Chrome WebDriver.
        leachText: inner HTML of the most recently loaded page.
    """

    def __init__(self, login, layer=2):
        """Start Chrome, open ``login`` and cache that page's HTML.

        Args:
            login: URL of the first page to load once the browser is up.
            layer: Query depth. Stored on the instance; no method of this
                class reads it.
        """
        self.all_time = 0
        self.keywords = ['航天信息', '仔仔']
        self.layer = layer  # query depth
        option = webdriver.ChromeOptions()
        option.add_argument('--window-size=1200,800')
        # option.add_argument('--headless')
        # Anti-crawler-detection tweaks: custom user agent, drop the
        # "enable-automation" switch, disable the AutomationControlled
        # blink feature.
        option.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        option.add_argument('--disable-blink-features=AutomationControlled')
        # No positional driver path: newer Selenium 4 releases no longer
        # accept it. "chromedriver" on PATH is the default lookup anyway.
        self.driver = webdriver.Chrome(options=option)
        self.driver.implicitly_wait(10)
        self.driver.get(login)
        time.sleep(1)
        print(self.driver.title)
        self.leachText = self._page_html()

    def _page_html(self):
        """Return the inner HTML of the current page's ``<html>`` element."""
        # find_element("xpath", ...) works on Selenium 3 and 4, whereas
        # find_element_by_xpath was removed in Selenium 4.3.
        root = self.driver.find_element("xpath", "//html")
        return root.get_attribute('innerHTML')

    def _matched_keywords(self, html):
        """Return the entries of ``self.keywords`` that occur in ``html``.

        Keywords are matched as literal substrings, so a keyword containing
        regex metacharacters (".", "+", "(" ...) cannot mis-match or raise.
        The result keeps the order of ``self.keywords``.
        """
        return [key for key in self.keywords if key in html]

    def main(self, url):
        """Load ``url`` and record every keyword found in its HTML.

        For each keyword found, the line ``<keyword><url>`` is printed and
        appended to ``err.txt``, and ``self.all_time`` grows by one. The
        running total is printed once the page has been processed.

        Args:
            url: Page address. Surrounding whitespace (such as the newline
                left by ``readlines()``) is removed; a blank value is
                skipped without touching the browser.
        """
        url = url.strip()
        if not url:
            return
        self.driver.get(url)
        time.sleep(0.5)
        self.leachText = self._page_html()
        for key in self._matched_keywords(self.leachText):
            record = key + url + '\n'
            print(record)
            self.all_time += 1
            # Explicit UTF-8 so the Chinese keywords can always be written,
            # regardless of the platform's default encoding.
            with open('err.txt', 'a', encoding='utf-8') as f:
                f.write(record)
        print("当前总出现次数:", self.all_time)
# Script entry: open the browser on a landing page, then scan every URL
# listed (one per line) in url2.txt.
url = "https://baidu.com"
serch_content = OpenBrowser(login=url)
try:
    # NOTE(review): the URL list is re-read and re-scanned in an endless
    # loop, so the script only stops when interrupted. Confirm this
    # continuous rescan is intended.
    while True:
        with open('url2.txt', 'r') as f:
            data = f.readlines()
        print(len(data))
        for mun, data_url in enumerate(data, start=1):
            # readlines() keeps the trailing newline; strip it so the
            # browser and the result file receive a clean URL.
            data_url = data_url.strip()
            if not data_url:
                continue  # blank line in the URL file
            print('开始执行', mun, '行数据')
            serch_content.main(data_url)
finally:
    # Close the browser and the driver process when the loop is
    # interrupted or a page load fails.
    serch_content.driver.quit()