Unfinished code

python 3.8.20

selenium 4.24.0

pyautogui 0.9.54
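
The script also imports lxml, which is not listed above. A minimal pinned requirements sketch, assuming the standard PyPI package names (the lxml version is left unpinned because it is not stated here):

selenium==4.24.0
PyAutoGUI==0.9.54
lxml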

from time import sleep
import re
import json
import pyautogui
import random
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
# from selenium.common.exceptions import StaleElementReferenceException
# from selenium.common.exceptions import TimeoutException

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import lxml.html

# xpath_union: parse the page source and extract text via the given XPath expression
def xpath_union(page_source, xpath_expr, default=""):
    try:
        tree = lxml.html.fromstring(page_source)
        results = tree.xpath(xpath_expr)
        #print('results='+str(results))
        if results:
            # If the XPath returned strings, join them
            if isinstance(results[0], str):
                joined = " ".join([r.strip() for r in results if isinstance(r, str) and r.strip()])
                if joined:
                    return joined
                else:
                    # If the joined string is empty, fall back to extracting text content from any node results
                    return " ".join([r.text_content().strip() for r in results if hasattr(r, 'text_content') and r.text_content().strip()])
            else:
                # If the results are element nodes, extract their text content
                return " ".join([r.text_content().strip() for r in results if hasattr(r, 'text_content') and r.text_content().strip()])
        else:
            return default
    except Exception:
        return default
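
# A minimal usage sketch for xpath_union; the HTML fragment and variable name below are
# invented for illustration only and are not taken from any real 51job page.
_demo_html = '<div class="desc"><p>Responsibilities</p><p>Requirements</p></div>'
assert xpath_union(_demo_html, '//div[@class="desc"]//text()', default='null') == 'Responsibilities Requirements'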

# def get_track(distance):
#     """
#     Build a movement track for the given offset
#     :param distance: offset to cover
#     :return: list of per-step moves
#     """
#     # movement track
#     track = []
#     # current displacement
#     current = 0
#     # threshold at which to start decelerating
#     mid = distance * 4 / 5
#     # time step
#     t = 0.2
#     # initial velocity
#     v = 0

#     while current < distance:
#         if current < mid:
#             # positive acceleration
#             a = 4
#         else:
#             # negative acceleration
#             a = -3
#         # initial velocity v0
#         v0 = v
#         # current velocity v = v0 + at
#         v = v0 + a * t
#         # distance moved x = v0*t + 1/2 * a * t^2
#         move = v0 * t + 1 / 2 * a * t * t
#         # update current displacement
#         current += move
#         # append to track
#         track.append(round(move))
#     return track

# def move_to_gap(browser, slider, tracks):
#     """
#     Drag the slider to the gap
#     :param slider: slider element
#     :param tracks: movement track
#     :return:
#     """
#     ActionChains(browser).click_and_hold(slider).perform()
#     for x in tracks:
#         ActionChains(browser).move_by_offset(xoffset=x, yoffset=0).perform()
#     sleep(0.5)
#     ActionChains(browser).release().perform()
    
# Screen size and centre point used by verification() below
screen_width, screen_height = pyautogui.size()
center_x = screen_width / 2
center_y = screen_height / 2

def verification(web):
    # Best-effort handling of the slider verification: reload the page, then use
    # pyautogui to drag across the area where the slider usually appears.
    try:
        if WebDriverWait(web, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="nc_1_n1z"]'))):
            #print('found //*[@id="nc_1_n1z"]')
            
            web.execute_script("location.reload()")
            
            # Click at the centre of the screen
            pyautogui.click(center_x, center_y)
            sleep(0.3)

            pyautogui.moveTo(screen_width * 0.44, screen_height * 0.41)
            pyautogui.drag(xOffset=800, yOffset=50, duration=0.2, button='left')
            sleep(1)
    except Exception:
        #print('//*[@id="nc_1_n1z"] not found')
        pass
    
    # try:
    #     if WebDriverWait(web, 10).until(EC.element_to_be_clickable(web.find_element(By.XPATH,'//*[@id="nc_1_n1z"]'))):
    #         print('found //*[@id="nc_1_n1z"]')
    #         # get the small slider element
    #         drawable_block = web.find_element(By.XPATH, '//*[@id="nc_1_n1z"]')
    #         block_width=300

    #         web.implicitly_wait(5)
    #         # # press and hold the slider button
    #         # ActionChains(web).click_and_hold(drawable_block).perform()

    #         # # move the slider a little at a time to mimic a human
    #         # i = 0
    #         # moved = 0
    #         # while moved < block_width:
    #         #     x = random.randint(1, 5)
    #         #     moved += x
    #         #     ActionChains(web).move_by_offset(xoffset=x, yoffset=0).perform()
    #         #     print("after move {} the position is {}".format(i, moved))
    #         #     i += 1
    #         # release the mouse once the move is complete
    #         # ActionChains(web).release().perform()

    #         track = get_track(block_width)
    #         move_to_gap(web, drawable_block, track)


    #         # wait 5 seconds and check the result
    #         sleep(5)
    #         try:
    #             if WebDriverWait(web, 10).until(
    #                 EC.presence_of_element_located((By.XPATH, '//div[@id="nc_1_refresh1"]'))
    #             ):
    #                 print('found retry block')
    #                 verification(web)
    #         except Exception as ex:
    #             print('retry block not found')
    #             print(ex)
    #             #print(web.page_source)
    # except Exception as e:
    #     print(f'e={e}')
    #     print('no verification block')
        

# ----------------------- Selenium scraper -----------------------
url = 'https://we.51job.com/pc/search?keyword=&searchType=2&sortType=0&metro='

# Number of result pages to scrape per category (the site returns at most 50)
total_page = 2

# Pool of User-Agent strings to rotate through
user_agents = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR "
    "2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center "
    "PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET "
    "CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR "
    "3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR "
    "2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; "
    ".NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) "
    "Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 "
    "Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 "
    "Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 "
    "TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 "
    "Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET "
    "CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 "
    "Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET "
    "CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET "
    "CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; "
    "360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET "
    "CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) "
    "Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 "
    "Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) "
    "Firefox/3.6.10 "
]

# Pick a random User-Agent (no proxy is configured in this script)
selected_user_agent = random.choice(user_agents)
print('selected_user_agent='+selected_user_agent)
# Configure Chrome options
opt = Options()
#opt.add_argument("--headless")  # run headless; uncomment if desired
opt.add_experimental_option('useAutomationExtension', False)  # disable the Chrome automation extension
opt.add_experimental_option('excludeSwitches', ['enable-automation'])  # drop the automation notice bar
opt.add_argument('--disable-blink-features=AutomationControlled')  # hide the Blink automation-controlled flag
opt.add_argument('--ignore-certificate-errors')  # suppress the "your connection is not private" warning
opt.add_argument("--disk-cache-size=0")

# Apply the randomly chosen User-Agent
opt.add_argument(f'user-agent={selected_user_agent}')

service = Service(r'C:\Chrome_11.9\chromedriver.exe')
web = Chrome(options=opt, service=service)
web.get(url)
web.maximize_window()
sleep(7)



# Click the job-function filter (职能); once it is clickable the page has finished loading
web.execute_script(
    "arguments[0].click();",
    WebDriverWait(web, 10).until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[1]/div[1]/div/div[1]/div[3]/p[1]')))
)



# First get this session's dialog id (the element id is generated per session)
dialogNum=web.find_element(By.CLASS_NAME,
                'el-dialog__wrapper.jbs_cascader_dialog.functype_cascader_dialog.jbs_multiple_dialog.jbs_orange_dialog'
    ).get_attribute('id')
print("dialogNumber is "+dialogNum)


# Iterate over the level-1 list. Each click mutates the DOM and invalidates previously located
# elements (StaleElementReferenceException), so elements are re-located dynamically, as sketched just below.
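# A minimal sketch of that re-location pattern (illustrative only: the loops below inline
# this logic rather than calling a helper, and the name rebuild_li is made up here):
def rebuild_li(ul_index, li_index):
    xpath = f'//*[@id="{dialogNum}"]/div/div[2]/div[2]/ul[{ul_index}]/li[{li_index}]'
    return web.find_element(By.XPATH, xpath)  # fetched fresh after every DOM change
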
level1_xpath = f'//*[@id="{dialogNum}"]/div/div[2]/div[2]/ul[1]/li'
level1_count = len(web.find_elements(By.XPATH, level1_xpath))
for l1_index in range(1, level1_count + 1):
    l1 = web.find_element(By.XPATH, f'//*[@id="{dialogNum}"]/div/div[2]/div[2]/ul[1]/li[{l1_index}]')
    web.execute_script("arguments[0].scrollIntoView();", l1)
    l1_text = l1.text
    print('!!!!!!!!!!' + l1_text + '!!!!!!!!!!')
    web.execute_script("arguments[0].click();", l1)

    # Iterate over the level-2 list
    level2_xpath = f'//*[@id="{dialogNum}"]/div/div[2]/div[2]/ul[2]/li'
    level2_count = len(web.find_elements(By.XPATH, level2_xpath))
    print('level2_count=' + str(level2_count))

    #count_for_l2_scroll=0
    safe_filename = re.sub(r'[\\/*?:"<>|]', '_', l1_text)
    with open(rf'Z:\pyscrawler_output\{safe_filename}_output.json', 'w', encoding='utf-8') as f:
        for l2_index in range(1, level2_count+1):
            print('l2_index='+str(l2_index))
            
            l2 = web.find_element(By.XPATH, f'//*[@id="{dialogNum}"]/div/div[2]/div[2]/ul[2]/li[{str(l2_index)}]')
            web.execute_script("arguments[0].scrollIntoView();", l2)
            
            l2_text=l2.text
            print('//'+l2_text+'//')
            web.execute_script("arguments[0].click();", l2)
            #count_for_l3_scroll=0
            sleep(1)
            # Iterate over the level-3 list
            level3_xpath = f'//*[@id="{dialogNum}"]/div/div[2]/div[2]/ul[3]/li'
            level3_count = len(web.find_elements(By.XPATH, level3_xpath))
            print('level3_count='+str(level3_count))
            for l3_index in range(1, level3_count+1):
                print('l3 index='+str(l3_index))
                # Wait until the current level-3 li is clickable; without this wait the level-3 selection may fail
                current_xpath = f'//*[@id="{dialogNum}"]/div/div[2]/div[2]/ul[3]/li[{l3_index}]'
                try:
                    l3 = WebDriverWait(web, 10).until(
                        EC.element_to_be_clickable((By.XPATH, current_xpath))
                    )
                    web.execute_script("arguments[0].scrollIntoView();", l3)
                except Exception as StaleErr3:
                    print(StaleErr3)
                    print('l2 mutated to stale element, trying to click l1 and l2 again.')
                    web.execute_script("arguments[0].scrollIntoView();", l1)
                    web.execute_script("arguments[0].click();", l1)
                    sleep(0.5)
                    l2 = web.find_element(By.XPATH, f'//*[@id="{dialogNum}"]/div/div[2]/div[2]/ul[2]/li[{str(l2_index)}]')
                    web.execute_script("arguments[0].scrollIntoView();", l2)
                    web.execute_script("arguments[0].click();", l2)
                    sleep(0.5)
                    l3=web.find_element(By.XPATH,current_xpath)
                    web.execute_script("arguments[0].scrollIntoView();", l3)
                    
                l3_text=l3.text
                try:
                    # Clear the previously selected job function by clicking its small "x", if present
                    web.execute_script("arguments[0].click();", web.find_element(By.XPATH, '//*[@id="_selectedFunctypeListRef"]/div[2]/span/i'))
                except Exception:
                    print('no cross')
                # The click handlers on this dynamic list are probably bound by the front-end framework's own
                # JavaScript listeners, so Selenium's native click() does not select the level-3 li; the click
                # has to be dispatched via JavaScript instead.
                print('----------'+l3_text+'----------')
                web.execute_script("arguments[0].click();", l3)
                sleep(0.5)
                # Click OK to close the job-function dialog
                #print(web.find_element(By.XPATH,'//*[@id="_selectedFunctypeListRef"]/div[2]').text)
                element = web.find_element(By.XPATH, f'//*[@id="{dialogNum}"]/div/div[3]/div/button[2]')
                web.execute_script("arguments[0].click();", element)
                sleep(2)
                try:
                    # Wait for the first job card to become clickable
                    WebDriverWait(web, 10).until(EC.element_to_be_clickable(
                        (By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[2]/div[1]/div[1]/div/div')))
                    #print('first job card found')
                except Exception as err:
                    sleep(1)
                    #print(f'err={err}')
                    # the first card was not found; there may be a slider verification
                    #verification(web)
                # Remember the original window handle
                original_window = web.current_window_handle
                # Iterate over the result pages
                for page in range(total_page):
                    if page > 0:
                        # From the second iteration onwards, click "next page"
                        try:
                            WebDriverWait(web, 10).until(
                                EC.element_to_be_clickable(
                                    (By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[3]/div/div/div/button[2]'))
                            ).click()
                        except Exception:
                            # no more pages
                            break
                    
                    # The first job card becoming clickable indicates the page has finished loading
                    try:
                        WebDriverWait(web, 10).until(EC.element_to_be_clickable(
                            (By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[2]/div[1]/div[1]/div/div')))
                    except Exception:
                        pass
                        #print('first job card not found')
                    # Collect all job card elements on the current page
                    job_info = web.find_elements(By.XPATH,
                                    '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[2]/div[1]/div/div/div')
                    for one_job in job_info:
                        # Read the 'sensorsdata' attribute that holds the key fields and parse it as a dict
                        job_attributes = one_job.get_attribute('sensorsdata')
                        if job_attributes is None:
                            continue
                        try:
                            job_attributes = json.loads(job_attributes)
                        except Exception:
                            continue
                        search_result = {}
                        search_result['level1']=l1_text
                        search_result['level2']=l2_text
                        search_result['level3']=l3_text
                        
                        # job id
                        search_result['jobId'] = job_attributes.get('jobId')
                        # job title
                        search_result['jobTitle'] = job_attributes.get('jobTitle')
                        #print(str(i)+'. '+search_result['jobTitle'])
                        # salary
                        search_result['jobSalary'] = job_attributes.get('jobSalary')
                        # education requirement
                        search_result['jobDegree'] = job_attributes.get('jobDegree', "null")
                        # years of experience required
                        search_result['jobYear'] = job_attributes.get('jobYear', "null")
                        # company id
                        search_result['companyId'] = job_attributes.get('companyId')
                        # company name
                        try:
                            search_result['company_name'] = one_job.find_element(By.XPATH, './div[4]/div[1]/a').text
                        except Exception:
                            search_result['company_name'] = "null"
                        # work location
                        search_result['jobArea'] = job_attributes.get('jobArea')
                        # company type
                        try:
                            search_result['company_type'] = one_job.find_element(By.XPATH, './div[4]/div[1]/span[1]').text
                        except Exception:
                            search_result['company_type'] = "null"
                        # company background
                        try:
                            search_result['company_background'] = one_job.find_element(By.XPATH, './div[4]/div[1]/span[2]').text
                        except Exception:
                            search_result['company_background'] = "null"
                        # company size
                        try:
                            search_result['company_scale'] = one_job.find_element(By.XPATH, './div[4]/div[1]/span[3]').text
                        except Exception:
                            search_result['company_scale'] = "null"
                        # HR activity label
                        try:
                            search_result['hr_active'] = one_job.find_element(By.XPATH, './div[4]/div[1]/span[4]').text
                        except Exception:
                            search_result['hr_active'] = "null"
                        # posting time
                        search_result['jobTime'] = job_attributes.get('jobTime')

                        sleep(random.random() * 3)
                        # Click to open the job detail page (it opens in a new window)
                        web.execute_script("arguments[0].click();",one_job.find_element(By.XPATH, './div[2]'))
                        # Switch to the newly opened window
                        for window_handle in web.window_handles:
                            if window_handle != original_window:
                                web.switch_to.window(window_handle)
                                break
                        
                        verification(web)
                        # URL of the current detail page
                        url_current = web.current_url
                        search_result['jobDetail_url']=url_current
                        # Extract the job description with xpath_union defined above; returns 'null' if nothing matches
                        job_information = xpath_union(web.page_source, 
                                        '/html/body/div[2]/div/div[3]/div[1]/div/text()', default='null')
    
                        # Close the detail page and switch back to the main window
                        web.close()
                        web.switch_to.window(original_window)
                        # Merge into the scraped record
                        search_result['job_information'] = job_information
                        
                        #print(search_result)
                        json.dump(search_result, f)
                        f.write('\n')
    
                    #print('page '+str(page+1)+' finished')

                # Restart the browser with a fresh User-Agent after finishing this level-3 category.
                # (Note: this leaves l1, l2 and dialogNum pointing at the old session, which is part
                # of what is still unfinished in this script.)
                web.quit()
                selected_user_agent = random.choice(user_agents)
                print('selected_user_agent=' + selected_user_agent)
                opt.add_argument(f'user-agent={selected_user_agent}')
                web = Chrome(options=opt, service=service)
                web.get(url)
                web.maximize_window()
                sleep(7)
                # Re-open the job-function dialog
                WebDriverWait(web, 10).until(EC.element_to_be_clickable(
                    (By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[1]/div[1]/div/div[1]/div[3]'))
                ).click()
            
web.quit()
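
Each level-1 category is written out as one JSON-lines file under Z:\pyscrawler_output. A minimal sketch for reading one of those files back, assuming a hypothetical file name (point it at whatever a run actually produced):

import json

records = []
with open(r'Z:\pyscrawler_output\example_output.json', encoding='utf-8') as f:  # hypothetical file name
    for line in f:
        if line.strip():
            records.append(json.loads(line))
print(len(records), 'records loaded')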

 

 
