# Python crawler: scrape "The English We Speak" (audio + transcripts) from kekenet.com

import requests

from bs4 import BeautifulSoup

import re

from tqdm import tqdm

import time

# Desktop-Chrome User-Agent so kekenet serves normal pages instead of rejecting the bot.
headers={

'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'

}

def remove_span_tag(tag):
    """Remove ``<span>...</span>`` elements from a tag's HTML and re-parse it.

    NOTE(review): the regex literal was destroyed when this code was pasted
    into the blog (the original read ``re.sub('', '', ...)``, a no-op).  On
    kekenet transcript pages the <span> elements appear to hold markup the
    author wanted dropped entirely, so the whole element is removed here —
    confirm the exact pattern against a live page.
    """
    content = str(tag)
    # re.S lets .*? cross line breaks; re.I matches <SPAN> etc.
    treated_content = re.sub(r'<span.*?</span>', '', content, flags=re.S | re.I)
    return BeautifulSoup(treated_content, 'lxml')

def remove_strong_tag(tag):
    """Strip ``<strong>`` / ``</strong>`` tags (keeping their text) and re-parse.

    NOTE(review): the pasted blog code shows ``re.sub('|', '', ...)`` — the
    tag literals around the ``|`` were eaten by the renderer.  The surviving
    pipe strongly suggests the original alternation ``<strong>|</strong>``,
    reconstructed here.
    """
    content = str(tag)
    treated_content = re.sub(r'<strong>|</strong>', '', content, flags=re.S | re.I)
    return BeautifulSoup(treated_content, 'lxml')

def remove2next1(string):
    """Collapse every run of two or more consecutive newlines into a single one.

    Bug fix: the original ``re.sub('\\n\\n', '\\n', ...)`` only halves each
    run per pass, so ``'\\n\\n\\n\\n'`` became ``'\\n\\n'`` instead of
    ``'\\n'``.  ``\\n{2,}`` collapses any run in one pass.
    (The re.S/re.I flags were dropped: neither affects a pure-\\n pattern.)
    """
    return re.sub(r'\n{2,}', '\n', string)

def change_br2next(tag):
    """Replace ``<br>`` / ``<br/>`` tags with newline characters and re-parse.

    NOTE(review): the ``<br/>`` literals in the original regexes were turned
    into real line breaks by the blog renderer; ``<br\\s*/?>`` covers both the
    self-closing and plain forms in one pattern.  (A commented-out variant in
    the paste deleted the tags instead of converting them.)
    """
    content = str(tag)
    treated_content = re.sub(r'<br\s*/?>', '\n', content, flags=re.S | re.I)
    return BeautifulSoup(treated_content, 'lxml')

def get_html(url):
    """Fetch a detail page and return its text decoded as UTF-8.

    Returns None on any non-200 status (the original did this implicitly by
    falling off the end of the function; made explicit here).  A timeout is
    added so a stalled connection cannot hang the whole crawl.
    """
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        # kekenet pages are UTF-8; force it rather than trusting detection.
        response.encoding = 'utf-8'
        return response.text
    return None

def parse_audio_text(html):
    """Parse a kekenet detail page.

    Returns a ``(title, audio_url, transcript_text)`` tuple.  The CSS
    selectors are tied to m.kekenet.com's markup; an ``IndexError`` is raised
    if the expected elements are missing (same as the original's ``[0]``
    indexing).

    Fixes: the paragraph texts were accumulated with quadratic ``+=`` string
    concatenation — replaced by a single ``str.join``; dead commented-out
    debug code removed.
    """
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('div.f-title')[0].string
    audio = soup.select('#show_mp3 > audio')[0].source['src']
    content = soup.select('#content > div > div.infoMain > div.f-y.w.hauto')[0]
    # Clean each transcript paragraph: drop spans, unwrap strongs, turn <br> into '\n'.
    result = ''.join(
        change_br2next(remove_strong_tag(remove_span_tag(p))).get_text()
        for p in content.select('p')
    )
    # Collapse the blank lines left over by the tag removal.
    return title, audio, remove2next1(result)

def parse_index(html):
    """Extract the absolute article URLs from one list page.

    Each ``.listItem`` element's first ``<a href>`` is a site-relative path;
    it is prefixed with the mobile-site origin to form a full URL.
    """
    soup = BeautifulSoup(html, 'lxml')
    return [
        'http://m.kekenet.com' + item.select('a')[0]['href']
        for item in soup.select('.listItem')
    ]

def get_index(url):
    """Fetch a list (index) page; returns its UTF-8 text, or None on non-200.

    Consistency fix: the original body was a byte-for-byte duplicate of
    ``get_html`` — delegate to it instead so the fetch logic lives in one
    place.
    """
    return get_html(url)

def save_text(title, content):
    """Append *content* to the UTF-8 text file ``<title>.txt``.

    Fix: the original called ``f.close()`` inside the ``with`` block — the
    context manager already closes the file; the explicit call was redundant.
    """
    with open(title + '.txt', 'a', encoding='utf-8') as f:
        f.write(content)

def downloadFILE(url, name):
    """Stream-download *url* into the file *name* with a tqdm progress bar.

    Fixes: ``resp.headers['Content-Length']`` raised KeyError on chunked
    responses — ``.get(..., 0)`` defaults to 0 (tqdm then shows no total);
    a timeout is added so a dead server cannot hang the crawl.  The local
    ``headers`` dict intentionally shadows the module-level one (different
    Chrome version string), kept as-is.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    resp = requests.get(url=url, stream=True, headers=headers, timeout=30)
    # Size in KiB for the progress bar; 0 when the server omits Content-Length.
    content_size = int(int(resp.headers.get('Content-Length', 0)) / 1024)
    with open(name, "wb") as f:
        print("Pkg total size is:", content_size, 'k,start...')
        for data in tqdm(iterable=resp.iter_content(1024), total=content_size, unit='k', desc=name):
            f.write(data)
    print(name, "download finished!")

if __name__ == "__main__":
    # Walk list pages 1-23; there are 24 pages in total — the last one is
    # http://m.kekenet.com/menu/14439/index.shtml (noted in the original).
    for i in range(1, 24):
        url = 'http://m.kekenet.com/menu/14439/List_{}.shtml'.format(str(i))
        html = get_index(url)
        srcs = parse_index(html)
        print('list', i)
        for src in srcs:
            detial_html = get_html(src)
            title, audio, result_text = parse_audio_text(detial_html)
            # Pull the episode number out of titles like "第123期..." and
            # zero-pad it to 3 digits so files sort correctly.
            title = re.search('第(.*?)期', title, re.S)
            if title:
                title = title.group(1).zfill(3)
                print(audio)
                print(result_text)
                save_text(title, result_text)
                downloadFILE(audio, title + '.mp3')

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值