上次写了爬取 iciba 单词发音文件的脚本,今天再测试时却发现已经爬取不到了。研究之后发现是网站修改了网页代码。没关系,爬与反爬是永恒的主题:这次更新了源码,并顺便把爬取到的文件存到我的 Hadoop 集群上。
写入 Hadoop 集群用了两种方法,分别使用 hdfs 和 pyhdfs 这两个 Python 包,供大家参考。
不废话,上源码。
# 抓取iciba网站上的发音文件并存储到hadoop系统中
import requests
import re
import random
from hdfs import InsecureClient
import pyhdfs
def donwload_voice(word, hdfs_hosts="192.168.20.129:50070", hdfs_user="chengwy"):
    """Fetch the iciba pronunciation MP3s for ``word`` and store them in HDFS.

    The iciba word page is downloaded, the British (``ph_en_mp3``) and
    American (``ph_am_mp3``) audio URLs are extracted from it with regular
    expressions, and every audio file that is found is written through
    pyhdfs to ``/user/{hdfs_user}/mp3/{word}_en.mp3`` and
    ``/user/{hdfs_user}/mp3/{word}_us.mp3``, overwriting existing files.

    Args:
        word: The word to look up.
        hdfs_hosts: Value passed as ``hosts`` to ``pyhdfs.HdfsClient``.
        hdfs_user: HDFS user name; also selects the target directory.

    Returns:
        None. Progress and HDFS errors are reported with ``print``.

    Raises:
        requests.RequestException: If the word page or an audio file
            cannot be downloaded (including non-2xx HTTP responses).
    """
    # Browser disguise: pick one of several desktop User-Agent strings.
    # The commas matter: without them Python concatenates the adjacent
    # literals into a single bogus User-Agent.
    uapools = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
    ]
    headers = {"User-Agent": random.choice(uapools)}
    timeout = 10  # seconds; without it a stalled connection blocks forever

    # Let requests build the query string so that words containing spaces
    # or non-ASCII characters are percent-encoded.
    r = requests.get('http://www.iciba.com/word', params={'w': word},
                     headers=headers, timeout=timeout)
    r.raise_for_status()
    print(r.url)

    # The page embeds the audio locations as JSON-style fields, e.g.
    #   "ph_en_mp3": "http://res.iciba.com/resource/amp3/oxford/0/07/a4/07a464945dda3d310b26995258d9a88a.mp3"
    #   "ph_am_mp3": "http://res.iciba.com/resource/amp3/1/0/76/80/7680edae4d6618e8fe00990c9f628966.mp3"
    # Whitespace around the colon is optional so both the compact and the
    # pretty-printed form are matched.
    target_dir = '/user/' + hdfs_user + '/mp3/'
    accents = (("ph_en_mp3", "_en.mp3"), ("ph_am_mp3", "_us.mp3"))
    voices = []  # (HDFS path, MP3 bytes) for every accent that was found
    for field, suffix in accents:
        found = re.search(r'"' + field + r'"\s*:\s*"(.*?)"', r.text, re.S)
        # NOTE(review): "\/" is undone in case the page JSON-escapes
        # slashes; harmless when it does not.
        voice_url = found.group(1).replace('\\/', '/') if found else ''
        if not voice_url:
            print('no ' + field + ' audio found for ' + word)
            continue
        voi = requests.get(voice_url, headers=headers, timeout=timeout)
        voi.raise_for_status()  # never store an error page as an .mp3
        voices.append((target_dir + word + suffix, voi.content))

    if not voices:
        return

    # Write the files into HDFS using the pyhdfs package.
    # Alternative using the hdfs package instead:
    #     c = InsecureClient(url="http://192.168.20.129:50070", user='chengwy', root="/")
    #     c.write(hd_file_name, content, True)
    try:
        client = pyhdfs.HdfsClient(hosts=hdfs_hosts, user_name=hdfs_user)
        print(*[path for path, _ in voices])
        for path, content in voices:
            client.create(path, content, overwrite=True)
        print('----success----')
    except Exception as e:
        # Storage is best effort: report the failure instead of crashing.
        print(e)
if __name__ == "__main__":
    # Example run: fetch both pronunciations of "path". Guarded so that
    # importing this module does not trigger network and HDFS access.
    donwload_voice('path')