A rookie of python_crawler----2(dict)

用crawler做了个字典,可以中翻英、英翻中,还能显示例句,算是一个简单的实验

# A simple dictionary built with a web crawler, supporting Chinese->English and English->Chinese lookups
import requests
import re
from bs4 import BeautifulSoup

# Module-level HTTP session; CH_EN and EN_CH both send their requests through it.
browser = requests.Session()


def build_url(mesg=None):
    """Return the query word percent-encoded, plus a lookup-direction flag.

    Args:
        mesg: Optional query text. When omitted (the default), the word is
            read interactively from standard input, as before.

    Returns:
        A ``(encoded, flag)`` tuple. ``encoded`` is the UTF-8 percent-encoded
        word, safe to append to a URL path. ``flag`` is 1 when the word
        starts with an ASCII letter and 0 otherwise (including empty input).
    """
    # Function-scope import so the block does not depend on edits elsewhere.
    from urllib.parse import quote

    if mesg is None:
        mesg = input("PLease input:")
    # Surrounding whitespace is never part of the word and would otherwise
    # defeat the leading-letter test below.
    mesg = mesg.strip()

    flag = 1 if re.match('[A-Za-z]', mesg) else 0

    # quote() encodes every byte that is not URL-safe. The previous
    # str(bytes).replace(r'\x', '%') trick only handled non-ASCII bytes and
    # mangled backslashes, quotes and spaces.
    return quote(mesg, safe=''), flag


def CH_EN(addr):
    """Fetch the dict.cn page for ``addr`` and print meanings and examples.

    This is the branch the script takes when the query does not start with
    an ASCII letter. It prints the headword, the text of the definition
    links, and the example sentences to standard output.

    Args:
        addr: Percent-encoded query word, appended to ``http://dict.cn/``.

    Returns:
        None.
    """
    # http://dict.cn/%E8%8B%B9%E6%9E%9C  apple
    # http://dict.cn/%E6%B0%B4%E6%9E%9C  fruit
    url = 'http://dict.cn/' + addr
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/70.0.3538.77 Safari/537.36'
    }

    # Bound the wait so a stalled connection cannot hang the script forever.
    response = browser.get(url, headers=headers, timeout=10)

    # <br> tags are removed before parsing so that list items keep their
    # text as a single string node.
    html = response.text.replace('<br>', '').replace('<br/>', '')
    dic = BeautifulSoup(html, 'html.parser')

    ch_label = dic.find('h1', class_='keyword')  # headword label
    if ch_label is None:
        # Without a headword element there is nothing meaningful to print.
        print('No result found.')
        return
    print(ch_label.get_text())

    # Basic definitions: anchors that point back into dict.cn, minus the
    # links whose path contains one of these fragments (presumably
    # other-language dictionaries and index pages -- not the word itself).
    skipped_fragments = (
        '/dir/', '/jp/', '/de/', '/fr/', '/kr/',
        '/es/', '/it/', '/ru/', '/list/yinbiao',
    )
    en_meaning = dic.find_all('a', href=re.compile(r'http://dict\.cn/[a-z]'))
    print('------------')
    print("基本释义")
    for mean in en_meaning:
        href = mean['href']
        if any(fragment in href for fragment in skipped_fragments):
            continue
        print(mean.get_text().strip())

    # Example sentences
    print('------------')
    print("例句")
    for ol in dic.find_all('ol', slider='2'):
        for li in ol.find_all('li'):
            text = li.string
            if text is None:
                # .string is None when the <li> holds more than one child
                # node; there is no single string to search in that case.
                continue
            # Keep only the part enclosed between two runs of five tabs.
            for sentence in re.findall('(?<=\t\t\t\t\t)(.*)(?=\t\t\t\t\t)', text):
                print(sentence)

    return


def EN_CH(addr):
    """Fetch the dict.cn page for ``addr`` and print definitions and examples.

    This is the branch the script takes when the query starts with an ASCII
    letter. It prints the headword, one line per definition entry, and the
    example sentences that begin with an ASCII letter.

    Args:
        addr: Percent-encoded query word, appended to ``http://dict.cn/``.

    Returns:
        None.
    """
    # http://dict.cn/%E8%8B%B9%E6%9E%9C  apple
    # http://dict.cn/%E6%B0%B4%E6%9E%9C  fruit
    url = 'http://dict.cn/' + addr
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/70.0.3538.77 Safari/537.36'
    }

    # Bound the wait so a stalled connection cannot hang the script forever.
    response = browser.get(url, headers=headers, timeout=10)

    # <br> tags are removed before parsing so that list items keep their
    # text as a single string node.
    html = response.text.replace('<br>', '').replace('<br/>', '')
    dic = BeautifulSoup(html, 'html.parser')

    en_label = dic.find('h1', class_='keyword')  # headword label
    if en_label is None:
        # Without a headword element there is nothing meaningful to print.
        print('No result found.')
        return
    print(en_label.get_text())

    # Definitions
    print('------------')
    print("释义")
    for item in dic.find_all('li'):
        part = item.find('span')
        if part is None:
            # The first <li> without a <span> ends the definition list.
            break
        gloss = item.find('strong')
        # A missing <strong> yields an empty gloss instead of a crash.
        gloss_text = gloss.get_text() if gloss is not None else ''
        print(part.get_text() + '\t' + gloss_text)

    # Example sentences
    print('------------')
    print("例句")
    for ol in dic.find_all('ol', slider='2'):
        if ol.string:
            # Only lists without a single direct string are scanned.
            continue
        for li in ol.find_all('li'):
            if li.find('strong'):
                continue
            text = li.string
            # .string is None when the <li> holds more than one child node.
            if text and re.search('^[A-Za-z].*', text):
                print(text + '\n')


# main
# Read the query, then dispatch on the direction flag: 1 selects EN_CH,
# any other value selects CH_EN.
url_addr, flag = build_url()
(EN_CH if flag == 1 else CH_EN)(url_addr)

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值