写了一些爬虫
希望各位老师能多给一些代码优化方面的建议,以及对后续学习的指导。
1. 英雄联盟(yxlm)官网英雄皮肤原画下载链接获取
# 导入浏览器驱动模块
import time
import csv
from selenium import webdriver
# 我们现在安装的是selenium4.x版本,4.x版本要求浏览器驱动使用服务方法导入
from selenium.webdriver.chrome.service import Service
# 导入定位方式包
from selenium.webdriver.common.by import By
import re
# Build the Chrome options object.
options = webdriver.ChromeOptions()
# Do not load images, to speed up page loads.
options.add_argument('blink-settings=imagesEnabled=false')
options.add_experimental_option('detach', True)
s = Service(executable_path='./chromedriver.exe')
# Start the browser with the options above.
browser = webdriver.Chrome(service=s, options=options)
# Output CSV; it is written by get_skin_link() and closed after the crawl loop.
file = open('英雄皮肤链接.csv', 'w', encoding='utf-8', newline='')
url = 'https://101.qq.com/#/hero'
browser.get(url)
# Fixed pause so the hero list has time to render before it is queried.
time.sleep(3)
li_list = browser.find_elements(By.XPATH, '//*[@id="app"]/div/div[3]/div/div[2]/ul/li')
# Map each hero title to the link of its detail page (first occurrence wins).
hero_name_link_dict = {}
for i in li_list:
    hero_name = i.find_element(By.XPATH, './div/p').text
    hero_link = i.find_element(By.XPATH, './a').get_attribute('href')
    hero_name_link_dict.setdefault(hero_name, hero_link)
# These heroes are skipped on purpose: per the author's notes their pages show
# a "no guide" screen that interrupts the crawl.
# pop(..., None) instead of del, so a renamed or missing hero does not raise
# KeyError before the crawl even starts.
for skipped_hero in ('水晶先锋', '兽灵行者', '龙血武姬'):
    hero_name_link_dict.pop(skipped_hero, None)
def get_skin_link(name, link):
    """Collect the skin image URLs from one hero page and append them to the CSV.

    Loads ``link`` in the module-level ``browser``, clicks the element at a
    fixed XPath (presumably the skins tab -- verify against the live page),
    then reads the ``src`` of the ``img`` under every matched ``div``.

    Args:
        name: Hero title, used as the only key of the returned dict.
        link: URL of the hero's detail page.

    Returns:
        ``{name: [src, ...]}``. The same dict is also written, as a single
        cell, to the module-level ``file``.
    """
    browser.get(link)
    time.sleep(2)  # fixed pause: let the page render
    skin_tab = browser.find_element(
        By.XPATH, '//*[@id="app"]/div/div[3]/div/div[2]/div[1]/div[1]/div[5]'
    )
    skin_tab.click()
    time.sleep(2)  # fixed pause: let the clicked view render
    thumbnails = browser.find_elements(
        By.XPATH,
        '/html/body/div[1]/div/div[3]/div/div[2]/div[3]/div[3]/div/div[2]/div[2]/div/div',
    )
    skin_urls = [
        thumb.find_element(By.XPATH, './img').get_attribute('src')
        for thumb in thumbnails
    ]
    name_skin_dict = {name: skin_urls}
    csv.writer(file).writerow([name_skin_dict])
    return name_skin_dict
# Crawl every remaining hero page. The CSV handle is closed in ``finally`` so
# rows already written are flushed even when one page raises part-way through.
try:
    for key, value in hero_name_link_dict.items():
        result = get_skin_link(key, value)
        print(result)
finally:
    file.close()
print('写入完成')
机械结构越复杂,就越容易发生故障,我觉得写代码也是一样:代码越长,越容易出问题。所以我把"获取皮肤链接"和"使用皮肤链接下载"分成了两段来写。以下是读取csv文件来实现下载操作的代码。
import requests
import os.path
import ast
import csv

# Each CSV row holds a single cell: the repr of a {hero_name: [skin_url, ...]}
# dict, as written by the link-collection script.
link_list = []
with open('英雄皮肤链接.csv', 'rt', encoding='utf-8', newline='') as file:
    for row in csv.reader(file):
        if not row:
            continue  # tolerate blank lines
        # csv.reader undoes the CSV quoting and literal_eval accepts only
        # Python literals; together they replace the original double eval(),
        # which would execute arbitrary code found in the file.
        link_list.append(ast.literal_eval(row[0]))

for hero_skins in link_list:
    for hero_name, hero_link in hero_skins.items():
        path = f'英雄皮肤汇总/{hero_name}'
        # makedirs also creates the parent folder, which os.mkdir would not.
        os.makedirs(path, exist_ok=True)
        for link in hero_link:
            print(link)
            # A timeout keeps one stalled download from hanging the whole run.
            response = requests.get(link, timeout=30)
            if not response.ok:
                # Do not save an error page under a .jpg name.
                print(f'下载失败({response.status_code}): {link}')
                continue
            # The filename suffix is the three characters before the last
            # four characters of the URL, in reverse order (unchanged from
            # the original naming scheme).
            with open(f'{path}/{hero_name}{str(link[-5:-8:-1])}.jpg', 'wb') as p:
                p.write(response.content)
        print('--------------------------分割线--------------------------')
print('下载完成')
2.boss直聘cookie登录再爬取(没有写数据持久化)
遇到需要登录才能搜索内容的网页很烦,要涉及到一系列的验证码破解,使用cookie登录真的是一个很巧妙的方法。
这里获取cookie时多次使用了显式等待。我爬boss直聘这个网站的时候,不知道什么原因,selenium打开的谷歌浏览器每刷新一次,网页就会跟着刷新很多次,导致我多次找不到元素。使用显式等待不仅实现了半自动登录操作,还很巧妙地绕开了网页多次刷新的问题。说实话,我也不明白这种多次刷新是谷歌浏览器的问题、boss直聘的反爬机制,还是selenium的问题。
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
# 从webdriver-manager的chrome包中导入ChromeDriverManager方法
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# ChromeDriverManager().install() detects the installed Chrome version and
# downloads the matching driver.
driver_path = '../'
s = ChromeService(ChromeDriverManager(path=driver_path).install())
options = webdriver.ChromeOptions()
# Keep the browser window open.
options.add_experimental_option("detach", True)
# 'enable-automation' is excluded to address the window.navigator.webdriver
# check; 'enable-logging' is excluded to avoid warnings in the terminal.
# This must be a single call: the original set "excludeSwitches" twice, and
# the second call replaced the first, silently dropping 'enable-logging'.
options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
# Start with a maximized window.
options.add_argument("--start-maximized")
browser = webdriver.Chrome(service=s, options=options)
# The site detects Selenium through JavaScript that runs in the page.
actions = ActionChains(browser)
url = 'https://www.zhipin.com/'
browser.get(url)
# Open the login form and pre-fill the phone number.
login_entry = browser.find_element(By.XPATH, '//*[@id="header"]/div[1]/div[4]/div/a[4]')
login_entry.click()
time.sleep(1)
phone_input = browser.find_element(
    By.XPATH,
    '//*[@id="wrap"]/div/div[2]/div[2]/div[2]/div[1]/div[1]/div/span[2]/input',
)
phone_input.send_keys('13002899781')
# The verification code is requested and typed in by hand. Block for up to
# 90 seconds until the expected text appears in the header element, which the
# script treats as "logged in".
logged_in = EC.text_to_be_present_in_element(
    (By.XPATH, '/html/body/div[1]/div[1]/div/div/div[1]/div[3]/ul/li[5]/a/span'),
    '程树伟',
)
WebDriverWait(browser, 90).until(logged_in)
# Persist the session cookies as the repr of a Python list; the follow-up
# script reads this file back.
session_cookies = browser.get_cookies()
with open('cookies.txt', 'w') as file:
    file.write(str(session_cookies))
使用cookie登录并开始爬取,顺便复习了一下正则表达式
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
# 从webdriver-manager的chrome包中导入ChromeDriverManager方法
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
# ChromeDriverManager().install() detects the installed Chrome version and
# downloads the matching driver.
driver_path = '../'
s = ChromeService(ChromeDriverManager(path=driver_path).install())
options = webdriver.ChromeOptions()
# Keep the browser window open.
options.add_experimental_option("detach", True)
# 'enable-automation' is excluded to address the window.navigator.webdriver
# check; 'enable-logging' is excluded to avoid warnings in the terminal.
# This must be a single call: the original set "excludeSwitches" twice, and
# the second call replaced the first, silently dropping 'enable-logging'.
options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
# Start with a maximized window.
options.add_argument("--start-maximized")
browser = webdriver.Chrome(service=s, options=options)
# The site detects Selenium through JavaScript that runs in the page.
actions = ActionChains(browser)
url = 'https://www.zhipin.com/'
browser.get(url)
import ast

# Load the cookies saved by the capture script. The file holds the repr of a
# list of dicts, so literal_eval parses it without executing arbitrary code
# the way eval() would.
with open('cookies.txt', 'r') as file:
    cookies = ast.literal_eval(file.read())
# Install the cookies, then reload so the page picks up the session.
for cookie in cookies:
    browser.add_cookie(cookie)
browser.refresh()
# The page keeps reloading after a refresh (cause unknown to the author) and
# elements cannot be found until it settles, so wait explicitly for the
# expected text in the header.
flag_1 = EC.text_to_be_present_in_element(
    (By.XPATH, '//*[@id="header"]/div[1]/div[4]/ul/li[5]/a/span'),
    '程树伟'
)
WebDriverWait(browser, 90).until(flag_1)
# Type the search keyword and submit to reach the job list page.
input_filed = browser.find_element(By.XPATH, '/html/body/div[1]/div[3]/div/div[1]/div[1]/form/div[2]/p/input')
input_filed.send_keys('数据分析')
time.sleep(0.1)
browser.find_element(By.XPATH, '//*[@id="wrap"]/div[3]/div/div[1]/div[1]/form/button').click()
# The result page reloads as well; wait explicitly again.
flag_2 = EC.text_to_be_present_in_element(
    (By.XPATH, '//*[@id="header"]/div[1]/div[3]/ul/li[5]/a/span'),
    '程树伟'
)
WebDriverWait(browser, 90).until(flag_2)
# Extract job data from the rendered HTML with regular expressions.
# Fixed pause before the page source is read.
time.sleep(10)
content = browser.page_source

# Step 1: grab each raw fragment, markup included.
job_name_list = re.findall('<span class="job-name">.*?</span><span', content, flags=re.S)
job_money_list = re.findall('<span class="salary">.*?</span>', content, flags=re.S)
job_link_list = re.findall('<a href="/job_detail.*?"', content, flags=re.S)

# Step 2: strip the surrounding markup from every fragment. Links are
# site-relative, so the host is prepended.
job_name_result = [
    re.sub('<span class="job-name">|</span><span', '', fragment)
    for fragment in job_name_list
]
job_money_result = [
    re.sub('<span class="salary">|</span>', '', fragment)
    for fragment in job_money_list
]
job_link_result = [
    'https://www.zhipin.com' + re.sub('<a href="|"', '', fragment)
    for fragment in job_link_list
]
print(job_name_result, job_money_result, job_link_result)
3.LianJia大爬虫
之所以叫它大爬虫,是因为它真的挺大的:链家上每个区、每个区的每一页都爬取到了。这个爬虫之前拿来做过练习,老师也讲过,现在自己重新构思写了一遍,又有了新的理解。
总之都是一个反复学习的过程。
import csv
import json
import requests
from lxml import etree
import os.path
from tqdm import tqdm
def request_get(link, timeout=10):
    """Request *link* with a desktop-browser User-Agent and return the page source.

    Args:
        link: URL to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled connection would block the crawl indefinitely.

    Returns:
        The response body decoded as text (``response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39'
    }
    response = requests.get(url=link, headers=headers, timeout=timeout)
    return response.text
def get_region_link(html_source):
    """Build a ``{region name: region URL}`` dict from the listing page HTML.

    Every ``a`` element matched by the fixed XPath contributes its first text
    node as the name, and its ``href`` appended to
    ``https://cd.lianjia.com/`` as the URL. If a name repeats, the first
    occurrence is kept.
    """
    anchors = etree.HTML(html_source).xpath(
        '/html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div/a'
    )
    region_dict = {}
    for anchor in anchors:
        name = anchor.xpath('./text()')[0]
        href = anchor.xpath('./@href')[0]
        region_dict.setdefault(str(name), 'https://cd.lianjia.com/' + str(href))
    return region_dict
def get_region_pages(region_dict):
    """Return ``{region name: total page count}`` for every region.

    Each region URL is fetched with ``request_get``. The page count is the
    ``totalPage`` field of the JSON stored in the ``page-data`` attribute of
    the pagination element.
    """
    page_num_dict = {}
    for region_name, region_url in region_dict.items():
        tree = etree.HTML(request_get(region_url))
        page_data = tree.xpath(
            './body/div[4]/div[1]/div[@class="contentBottom clear"]/div[2]/div/@page-data'
        )[0]
        page_num_dict[region_name] = json.loads(page_data)['totalPage']
    return page_num_dict
def get_house_info(region_dict, page_num_dict):
    """Crawl every result page of every region and write one CSV per region.

    Args:
        region_dict: ``{region name: region URL}``.
        page_num_dict: ``{region name: total page count}``; it must contain
            every key of ``region_dict``.

    For each region the ``[title, link]`` pairs of all pages are collected
    and handed to ``write_to_csv``.
    """
    for key, links in region_dict.items():
        house_info = []
        page_number = page_num_dict[key]
        # Normalise the trailing slash once so the page URL is well-formed
        # whether or not the region link ends with '/'.
        base_link = links.rstrip('/')
        for i in tqdm(range(1, page_number + 1), desc=f'{key}区进度'):
            # Fix: the original f-string was '{links} + pg{i}/', which put a
            # literal " + " into the requested URL.
            page_link = f'{base_link}/pg{i}/'
            root = etree.HTML(request_get(page_link))
            for j in root.xpath('//*[@id="content"]/div[1]/ul/li'):
                titles = j.xpath('./div[1]/div[1]/a/text()')
                hrefs = j.xpath('./a/@href')
                # Skip list items that lack a title or link instead of
                # crashing the whole crawl with IndexError.
                if not titles or not hrefs:
                    continue
                house_info.append([titles[0], hrefs[0]])
        write_to_csv(key, house_info)
def write_to_csv(key, house_info):
    """Persist one region's listings to a UTF-8 CSV file.

    Args:
        key: Region name; it becomes part of the file name.
        house_info: Iterable of ``[title, link]`` rows.

    The file is written to the ``链家各区房屋及链接`` folder under the current
    working directory and starts with a header row. An existing file for the
    same region is overwritten.
    """
    path = '链家各区房屋及链接'
    # exist_ok avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs(path, exist_ok=True)
    with open(f'./{path}/成都链家{key}区二手房数据.csv', 'w', encoding='utf-8', newline='') as file:
        # One writer for the whole file instead of a new one per row.
        writer = csv.writer(file)
        writer.writerow(['房屋标题', '房屋链接'])
        writer.writerows(house_info)
def main(url):
    """Run the whole crawl starting from the listing page at *url*.

    Steps: fetch the page, collect the region links, look up each region's
    page count, then crawl every page and write the per-region CSV files.
    """
    region_dict = get_region_link(request_get(url))
    get_house_info(region_dict, get_region_pages(region_dict))
# Script entry point. The original compared __name__ with 'main', which is
# never true, so the crawler silently did nothing when run.
if __name__ == '__main__':
    url = 'https://cd.lianjia.com/ershoufang/'
    main(url)
爬取结果过不了审,自己拿代码试一下吧。
大家一起加油吧!