Chrome Temp Files Location

This article describes a way to set the Chrome browser's cache directory and cache size: by adding startup parameters you can point the cache at a custom path and cap how large it may grow.


"X:\Chrome_HOME\Chrome.exe" --disk-cache-dir="X:\temp\TMP" --disk-cache-size=150000000
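
Two flags do the work in the command above: --disk-cache-dir moves Chrome's cache out of the default profile location, and --disk-cache-size caps it in bytes (150000000 bytes is roughly 143 MB). The same flags can be passed when Chrome is launched from a script. Below is a minimal sketch using Python's subprocess module; the executable and directory paths are taken from the example above and are only placeholders, so adjust them for your machine.

import os
import subprocess

# Paths from the example above -- placeholders, adjust for your own machine.
CHROME_EXE = r"X:\Chrome_HOME\Chrome.exe"
CACHE_DIR = r"X:\temp\TMP"
CACHE_SIZE_BYTES = 150_000_000  # ~143 MB

# Make sure the cache directory exists before Chrome starts.
os.makedirs(CACHE_DIR, exist_ok=True)

# Launch Chrome with a custom cache location and a size cap.
subprocess.Popen([
    CHROME_EXE,
    f"--disk-cache-dir={CACHE_DIR}",
    f"--disk-cache-size={CACHE_SIZE_BYTES}",
])

After launch, chrome://version shows the command line of the running instance, so you can confirm the flags were actually picked up.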

The Python/Selenium crawler below drives Chrome through chromedriver (reusing an already-running browser via the DevTools debugging port); the traceback it produces follows the code.

Code:

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import csv
import os
import time
import json


class spider(object):
    def __init__(self, type, page):
        self.type = type  # job keyword
        self.page = page  # current page number
        self.spiderUrl = 'https://www.zhipin.com/web/geek/job?query=%s&city=101281100&page=%s'

    def startBrower(self):
        service = Service('C:/Users/23653/PycharmProjects/chromedriver.exe')
        options = webdriver.ChromeOptions()
        # Attach to an already-running browser to evade anti-crawler checks;
        # no other Chrome instance may be running before this.
        options.add_experimental_option('debuggerAddress', 'localhost:9222')
        # options.add_experimental_option('excludeSwitches', ['enable-automation'])
        brower = webdriver.Chrome(service=service, options=options)
        return brower

    def main(self, page):  # page is the total number of pages to crawl
        # if self.page > page: return
        brower = self.startBrower()
        print("正在爬取页面路径:" + self.spiderUrl % (self.type, self.page))
        brower.get(self.spiderUrl % (self.type, self.page))
        time.sleep(15)  # wait 15 seconds for the page to load
        job_list = brower.find_elements(by=By.XPATH, value='//ul[@class="job-list-box"]/li')
        for index, job in enumerate(job_list):
            try:
                jobData = []
                print("正在爬取第%d个数据" % (index + 1))
                # job title
                title = job.find_element(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-title')]/span[@class='job-name']").text
                # city
                addresses = job.find_element(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-title')]/span[@class='job-area-wrapper']/span").text.split('·')
                address = addresses[0]
                # district
                if len(addresses) != 1:
                    dist = addresses[1]
                else:
                    dist = ''
                # job category (the search keyword)
                type = self.type
                tag_list = job.find_elements(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]/ul[@class='tag-list']/li")
                if len(tag_list) == 2:
                    # education requirement
                    educational = tag_list[1].text
                    # work experience
                    workExperience = tag_list[0].text
                else:
                    # education requirement
                    educational = tag_list[2].text
                    # work experience
                    workExperience = tag_list[1].text
                # HR name
                hrName = job.find_element(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]/div[@class='info-public']").text
                # HR position
                hrWork = job.find_element(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]/div[@class='info-public']/em").text
                # job tags
                workTag = job.find_elements(by=By.XPATH, value="./div[contains(@class,'job-card-footer')]/ul[@class='tag-list']/li")
                workTag = json.dumps(list(map(lambda x: x.text, workTag)))
                # whether the posting is an internship
                pratice = 0
                salaries = job.find_element(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]/span[@class='salary']").text
                if salaries.find('K') != -1:
                    salaries = salaries.split('·')
                    if len(salaries) == 1:
                        # salary
                        salary = list(map(lambda x: int(x) * 1000, salaries[0].replace('K', '').split('-')))
                        # year-end bonus months
                        salaryMonth = '0薪'
                    else:
                        # salary
                        salary = list(map(lambda x: int(x) * 1000, salaries[0].replace('K', '').split('-')))
                        # year-end bonus months
                        salaryMonth = salaries[1]
                else:
                    # salary (daily rate for internships)
                    salary = list(map(lambda x: int(x), salaries.replace('元/天', '').split('-')))
                    # year-end bonus months
                    salaryMonth = '0薪'
                    pratice = 1
                # company name
                companyTitle = job.find_element(by=By.XPATH, value=".//div[@class='job-card-right']/div[contains(@class,'company-info')]/h3/a").text
                # company logo
                companyAvatar = job.find_element(by=By.XPATH, value=".//div[@class='job-card-right']/div[contains(@class,'company-logo')]/a/img").get_attribute("src")
                companyInfos = job.find_elements(by=By.XPATH, value=".//div[@class='job-card-right']/div[contains(@class,'company-info')]/ul[@class='company-tag-list']/li")
                if len(companyInfos) == 3:
                    # company nature
                    companyNature = companyInfos[0].text
                    # financing status
                    companyStatus = companyInfos[1].text
                    # company size
                    companyPeoples = companyInfos[2].text
                    if companyPeoples != '1000人以上':
                        companyPeople = list(map(lambda x: int(x), companyInfos[2].text.replace('人', '').split('-')))
                    else:
                        companyPeople = [0, 10000]
                else:
                    # company nature
                    companyNature = companyInfos[0].text
                    # financing status
                    companyStatus = '未融资'
                    # company size
                    companyPeoples = companyInfos[1].text
                    if companyPeoples != '1000人以上':
                        companyPeople = list(map(lambda x: int(x), companyInfos[1].text.replace('人', '').split('-')))
                    else:
                        companyPeople = [0, 10000]
                # company benefits
                companyTags = job.find_element(by=By.XPATH, value='./div[contains(@class,"job-card-footer")]/div[@class="info-desc"]').text
                if not companyTags:
                    companyTags = '无'
                else:
                    companyTags = json.dumps(companyTags.split(','))
                # job detail page URL
                detailUrl = job.find_element(by=By.XPATH, value='.//a[@class="job-card-left"]').get_attribute('href')
                # company detail page URL
                companyUrl = job.find_element(by=By.XPATH, value='.//div[@class="job-card-right"]/div[@class="company-info"]/h3/a').get_attribute('href')
                jobData.append(title)
                jobData.append(address)
                jobData.append(type)
                jobData.append(educational)
                jobData.append(workExperience)
                jobData.append(workTag)
                jobData.append(salary)
                jobData.append(salaryMonth)
                jobData.append(companyTags)
                jobData.append(hrWork)
                jobData.append(hrName)
                jobData.append(pratice)
                jobData.append(companyTitle)
                jobData.append(companyAvatar)
                jobData.append(companyNature)
                jobData.append(companyStatus)
                jobData.append(companyPeople)
                jobData.append(detailUrl)
                jobData.append(companyUrl)
                jobData.append(dist)
                self.save_to_csv(jobData)
            except:
                pass
        self.page += 1
        self.main(page)

    # data cleaning
    def clear_csv(self):
        df = pd.read_csv('./temp.csv')
        df.dropna(inplace=True)
        df.drop_duplicates(inplace=True)
        df['salaryMonth'] = df['salaryMonth'].map(lambda x: x.replace('薪', ''))
        print("总数据为%d" % df.shape[0])
        return df.values

    def save_to_csv(self, rowData):
        with open('./temp.csv', 'a', newline='', encoding='utf-8') as wf:
            writer = csv.writer(wf)
            writer.writerow(rowData)

    def init(self):
        if not os.path.exists('./temp.csv'):
            with open('./temp.csv', 'a', newline='', encoding='utf-8') as wf:
                writer = csv.writer(wf)
                writer.writerow(["title", "address", "type", "educational", "workExperience", "workTag", "salary", "salaryMonth",
                                 "companyTags", "hrWork", "hrName", "pratice", "companyTitle", "companyAvatar", "companyNature",
                                 "companyStatus", "companyPeople", "detailUrl", "companyUrl", "dist"])


if __name__ == "__main__":
    spiderObj = spider('java', 1)  # job keyword and starting page
    spiderObj.init()
    spiderObj.main(10)  # crawl 11 pages in total: start page 1 plus 10 more

Output:

\a3\python.exe D:\数据可视化\main.py
Traceback (most recent call last):
  File "D:\a3\lib\site-packages\selenium\webdriver\common\driver_finder.py", line 64, in _binary_paths
    raise ValueError(f"The path is not a valid file: {path}")
ValueError: The path is not a valid file: C:/Users/23653/PycharmProjects/chromedriver.exe

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "D:\数据可视化\main.py", line 210, in <module>
    spiderObj.main(10)#爬取11页,初始页1+爬取页10
  File "D:\数据可视化\main.py", line 28, in main
    brower = self.startBrower()
  File "D:\数据可视化\main.py", line 22, in startBrower
    brower = webdriver.Chrome(service=service, options=options)
  File "D:\a3\lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 47, in __init__
    super().__init__(
  File "D:\a3\lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 53, in __init__
    if finder.get_browser_path():
  File "D:\a3\lib\site-packages\selenium\webdriver\common\driver_finder.py", line 47, in get_browser_path
    return self._binary_paths()["browser_path"]
  File "D:\a3\lib\site-packages\selenium\webdriver\common\driver_finder.py", line 78, in _binary_paths
    raise NoSuchDriverException(msg) from err
selenium.common.exceptions.NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location
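
The first exception in the traceback names the actual problem: the chromedriver path passed to Service (C:/Users/23653/PycharmProjects/chromedriver.exe) is not a valid file, so Selenium's driver finder gives up with NoSuchDriverException. Note that chromedriver is still required even when attaching to a running browser through debuggerAddress. A minimal sketch of two common fixes follows, assuming Selenium 4.6 or newer (where the bundled Selenium Manager can locate or download a matching driver); the explicit path in Option 2 is only a placeholder.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_experimental_option('debuggerAddress', 'localhost:9222')

# Option 1: omit the Service path and let Selenium Manager (Selenium >= 4.6)
# find or download a chromedriver that matches the installed Chrome.
driver = webdriver.Chrome(options=options)

# Option 2: keep an explicit Service, but point it at a chromedriver.exe
# that really exists on disk (placeholder path below).
# service = Service(r"C:\tools\chromedriver\chromedriver.exe")
# driver = webdriver.Chrome(service=service, options=options)

Either way, the browser the script attaches to must already be running with --remote-debugging-port=9222, since debuggerAddress only connects to an existing Chrome session.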