提示:点开主页可以下载全部代码跟数据进行参考!
前言
本文仅用于技术分享与交流;如将文中内容用于其他用途,后果由使用者自行承担。
一、封装selenium
from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
def share_browser(binary_location=r"C:\Program Files\Google\Chrome\Application\chrome.exe"):
    """Create, maximize and return a Chrome WebDriver used by the scraper.

    Args:
        binary_location: Path to the Chrome executable. Defaults to the
            standard Windows install path. Pass a falsy value (``None`` or
            ``""``) to leave the option unset.

    Returns:
        The started ``webdriver.Chrome`` instance with its window maximized.
    """
    chrome_options = uc.ChromeOptions()

    # Experimental options related to the "controlled by automated software"
    # markers. Each one is set once (the original set excludeSwitches twice).
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # Command-line switches, in the same order as before.
    # The former 'service_args = [...]' entry was dropped: it was a garbled
    # snippet pasted as a switch, not a real Chrome flag. Certificate errors
    # are still ignored through --ignore-certificate-errors below.
    # Optional switches one may add here: '--headless', '--proxy-server=...'.
    switches = (
        '--disable-gpu',
        '--no-sandbox',
        'disable-infobars',
        '--single-process',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        'log-level=3',
        '--disable-extensions',
        '--disable-popup-blocking',
        '--profile-directory=Default',
        '--ignore-certificate-errors',
        '--disable-plugins-discovery',
        '--incognito',
        '--no-first-run',
        '--no-service-autorun',
        '--no-default-browser-check',
        '--password-store=basic',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
    )
    for switch in switches:
        chrome_options.add_argument(switch)

    # Path of the Chrome browser executable itself (not chromedriver).
    if binary_location:
        chrome_options.binary_location = binary_location

    # `options=` is the supported keyword; `chrome_options=` is deprecated
    # and no longer accepted by newer Selenium 4 releases.
    browser = webdriver.Chrome(options=chrome_options)
    browser.maximize_window()
    return browser
二、抓取流程
1.翻页功能
代码如下(示例):
def get_href(page, timeout=10):
    """Fetch one page of the rental search results and return its listing links.

    Args:
        page: Page number interpolated into the result-list URL.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The list of ``href`` attribute values matched by the listing-title
        XPath; empty when nothing matches.
    """
    print(f'获取第:{page}页数据...')
    url = f'https://bj.zu.ke.com/zufang/pg{page}rs%E5%85%AC%E5%AF%93/#contentList'
    # requests applies no timeout by default, so set one explicitly.
    res = requests.get(url, timeout=timeout)
    tree = etree.HTML(res.text)
    return tree.xpath('//*[@id="content"]/div[1]/div[1]/div/div/p[1]/a/@href')
2.通过xpath获取数据
代码如下(示例):
# (label, XPath) pairs for the "basic info" fields, listed in the order the
# values appear in the returned row. The label is replaced by a single space.
_INFO_FIELDS = (
    ('面积:', '//*[@id="info"]/ul[1]/li[2]/text()'),    # area
    ('维护:', '//*[@id="info"]/ul[1]/li[5]/text()'),    # maintenance
    ('楼层:', '//*[@id="info"]/ul[1]/li[8]/text()'),    # floor
    ('车位:', '//*[@id="info"]/ul[1]/li[11]/text()'),   # parking
    ('用电:', '//*[@id="info"]/ul[1]/li[14]/text()'),   # electricity
    ('采暖:', '//*[@id="info"]/ul[1]/li[17]/text()'),   # heating
    ('朝向:', '//*[@id="info"]/ul[1]/li[3]/text()'),    # orientation
    ('入住:', '//*[@id="info"]/ul[1]/li[6]/text()'),    # move-in time
    ('电梯:', '//*[@id="info"]/ul[1]/li[9]/text()'),    # lift
    ('用水:', '//*[@id="info"]/ul[1]/li[12]/text()'),   # water
    ('燃气:', '//*[@id="info"]/ul[1]/li[15]/text()'),   # gas
    ('租期:', '//*[@id="info"]/ul[2]/li[2]/text()'),    # tenancy
)

# Facility icons are li[2]..li[11]: washing machine, air conditioner,
# wardrobe, TV, fridge, water heater, bed, heating, broadband, natural gas.
_FACILITY_CLASS_XPATH = '/html/body/div[3]/div[1]/div[3]/div[3]/div[2]/ul/li[{}]/@class'
_FACILITY_INDEXES = range(2, 12)

# Name of the first entry in the currently selected "around" tab.
_AROUND_FIRST_NAME_XPATH = '//*[@id="around"]/div/div[2]/ul[2]/li[1]/p[1]/span/text()'


def _first_or_blank(tree, xpath):
    """Return the first XPath match in *tree*, or '' when nothing matches."""
    matches = tree.xpath(xpath)
    return matches[0] if matches else ''


def _open_around_tab(tab_index, pause):
    """Click tab *tab_index* of the "around" section, wait, and re-parse the page."""
    button = driver.find_element(
        By.XPATH, f'//*[@id="around"]/div/div[2]/ul[1]/li[{tab_index}]')
    # The click is dispatched through the page's `$` function.
    driver.execute_script("$(arguments[0]).click()", button)
    time.sleep(pause)
    return etree.HTML(driver.page_source)


def get_content(href):
    """Scrape one listing detail page and return its fields as a flat list.

    URLs already recorded in ``used_url.txt`` are skipped. After a successful
    scrape the URL is appended to that file.

    Args:
        href: Listing path, appended to ``https://bj.zu.ke.com/``.

    Returns:
        ``[title, price, house_class, <12 info fields>, <10 facility flags>,
        metro, bus_route, school, hospital, url]`` on success; an empty list
        when the URL was already scraped or scraping failed.
    """
    url = 'https://bj.zu.ke.com/' + href

    # A missing history file (first run) simply means nothing was scraped yet.
    try:
        with open('used_url.txt', 'r', encoding='utf-8') as fp:
            used_urls = {line.strip('\n') for line in fp}
    except FileNotFoundError:
        used_urls = set()

    data_list = []
    if url in used_urls:
        return data_list

    try:
        driver.get(url)
        tree = etree.HTML(driver.page_source)

        # Title, price (yuan per month) and house type.
        title = tree.xpath('/html/body/div[3]/div[1]/div[3]/p/text()')[0].strip('\n').strip(' ')
        price = _first_or_blank(tree, '//*[@id="aside"]/div[1]/span/text()')
        house_class = tree.xpath('//*[@id="aside"]/ul/li[2]/text()')[0]

        # Basic info fields. A missing field raises IndexError and fails the
        # whole page, as before.
        info = [tree.xpath(xpath)[0].replace(label, ' ') for label, xpath in _INFO_FIELDS]

        # Facility flags: a class attribute containing 'no' is reported as absent.
        facilities = []
        for index in _FACILITY_INDEXES:
            css_class = tree.xpath(_FACILITY_CLASS_XPATH.format(index))[0]
            facilities.append('有' if 'no' not in css_class else '无')

        # Scroll the "around" section into view, then wait before re-parsing.
        heading = driver.find_element(By.XPATH, '//*[@id="around"]/h3')
        driver.execute_script("arguments[0].scrollIntoView();", heading)
        time.sleep(1)

        # Metro (the tab shown after scrolling).
        tree = etree.HTML(driver.page_source)
        metro = _first_or_blank(tree, '//*[@id="around"]/div/div[2]/ul[2]/li/p[1]/span/text()')

        # Bus.
        tree = _open_around_tab(2, 1)
        bus_route = _first_or_blank(tree, '//*[@id="around"]/div/div[2]/ul[2]/li[1]/p[2]/text()')

        # School: stored as the list's repr without the surrounding brackets.
        tree = _open_around_tab(3, 2)
        school = str(tree.xpath(_AROUND_FIRST_NAME_XPATH)).strip('[').strip(']')

        # Hospital: same representation as school.
        tree = _open_around_tab(4, 2)
        hospital = str(tree.xpath(_AROUND_FIRST_NAME_XPATH)).strip('[').strip(']')

        data_list = [title, price, house_class, *info, *facilities,
                     metro, bus_route, school, hospital, url]
        print(data_list)

        with open('used_url.txt', 'a', encoding='utf-8') as fp:
            fp.write(url + '\n')
    except Exception as exc:
        # Best-effort: report and move on, but let KeyboardInterrupt and
        # SystemExit propagate (a bare `except:` would swallow them).
        print(f'出错url:{url}')
        print(f'{type(exc).__name__}: {exc}')
    return data_list
总结
点开主页可以下载全部代码跟数据进行参考!