Python 爬虫的简单项目之爬取成语

from selenium import webdriver
from selenium.webdriver.common.by import By
from idiom import DbHandle

# For every pinyin stored in py_body, open its page on xh.5156edu.com and
# insert one chinese_characters row per character link found on that page.
option = webdriver.ChromeOptions()
option.add_argument('headless')
web = webdriver.Chrome(options=option)
try:
    web.get("http://xh.5156edu.com/")
    # find_element(By...) replaces the find_element_by_* helpers, which were
    # removed in Selenium 4.3; this form is also accepted by Selenium 3.
    web.find_element(By.LINK_TEXT, "按拼音检索").click()
    db = DbHandle.DbHandle()
    sql = "SELECT id,body FROM py_body"
    print(sql)
    # Named pinyin_rows (was `list`) so the builtin is not shadowed.
    pinyin_rows = db.select_all(sql)
    print(pinyin_rows)
    for one in pinyin_rows:
        print('-----111111----')
        print(one['body'])  # pinyin
        print(one['id'])    # pinyin id
        web.find_element(By.LINK_TEXT, one['body']).click()
        # Per the original author's note, each matching <tr> is the block of
        # one tone; the tone number is simply the 1-based row position.
        tone_rows = web.find_elements(By.XPATH, '//tr[@bgcolor="#ffffff"]')
        for tone, two in enumerate(tone_rows, start=1):
            # Read the tone label once instead of re-querying it per character.
            tone_text = two.find_element(By.CLASS_NAME, 'font_14').text
            print(tone_text + '----------000-----' + str(tone))  # tone
            # Per the original author's note, these are the <a> tags of the
            # characters.
            for py in two.find_elements(By.CLASS_NAME, 'fontbox'):
                link_text = py.text
                if not link_text:
                    # An empty link would make link_text[0] raise IndexError
                    # and abort the whole crawl; skip it instead.
                    continue
                word = link_text[0]  # only the first character is stored
                print(word + '-------9---')  # character
                # NOTE(review): the statement is built from scraped text; a
                # quote in that text would break it. Switch to parameter
                # binding if DbHandle.update supports it -- its signature is
                # not visible here.
                sql = "INSERT INTO chinese_characters (pin_yin_id,word,pin_yin,tone) " \
                      "VALUES ('%s','%s','%s','%s')" % (one['id'], word, tone_text, tone)
                print(sql)
                db.update(sql)
        # Return to the pinyin index so the next link text can be found.
        web.back()
finally:
    # Always shut the headless browser down, even when the crawl fails,
    # so no orphaned Chrome/chromedriver process is left behind.
    web.quit()
from selenium import webdriver
from selenium.webdriver.common.by import By
from word import DbHandel

# For every row of py_head, collect the links on the pinyin index page whose
# text starts with that head and insert each link text into py_body together
# with the id of the head row.
option = webdriver.ChromeOptions()
option.add_argument('headless')
web = webdriver.Chrome(options=option)
try:
    web.get("http://xh.5156edu.com/")
    # find_element(By...) replaces the find_element_by_* helpers, which were
    # removed in Selenium 4.3; this form is also accepted by Selenium 3.
    web.find_element(By.LINK_TEXT, "按拼音检索").click()
    db = DbHandel.DbHandle()
    sql = "SELECT id,head FROM py_head"
    print(sql)
    # Named head_rows (was `list`) so the builtin is not shadowed.
    head_rows = db.select_all(sql)
    print(head_rows)
    for one in head_rows:
        print('-----111111----')
        print(one['head'])
        print(one['id'])
        # Match every <a> whose text begins with the current head.
        condition = '//a[starts-with(text(), "' + one['head'] + '")]'
        for one1 in web.find_elements(By.XPATH, condition):
            body_text = one1.text  # read once; each .text is a driver call
            # NOTE(review): the statement is built from scraped text; switch
            # to parameter binding if DbHandle.update supports it -- its
            # signature is not visible here.
            sql = "INSERT INTO py_body (body,head_id) VALUES ('%s','%s')" % (body_text, one['id'])
            print(sql)
            db.update(sql)
            print(body_text + '---')
finally:
    # Always shut the headless browser down, even when the crawl fails,
    # so no orphaned Chrome/chromedriver process is left behind.
    web.quit()
from selenium import webdriver
from selenium.webdriver.common.by import By
from word import DbHandel

# Walk the <p> elements of the pinyin index page, keep those whose text is a
# single ASCII letter, and print every link whose text starts with that
# letter. The write to py_head
# (INSERT INTO py_head (head) VALUES ('<letter>')) is disabled in this
# version, so the script only prints what it finds.
option = webdriver.ChromeOptions()
option.add_argument('headless')
web = webdriver.Chrome(options=option)
try:
    web.get("http://xh.5156edu.com/")
    # find_element(By...) replaces the find_element_by_* helpers, which were
    # removed in Selenium 4.3; this form is also accepted by Selenium 3.
    web.find_element(By.LINK_TEXT, "按拼音检索").click()
    # Named paragraphs (was `list`) so the builtin is not shadowed.
    paragraphs = web.find_elements(By.TAG_NAME, 'p')
    data = "abcdefghijklmnopqrstuvwxyz"
    db = DbHandel.DbHandle()
    for one in paragraphs:
        text = one.text.lower()
        # `text in data` alone is a substring test, so runs such as "de" or
        # "st" would also pass; require exactly one character so only single
        # letters are accepted (this also rejects the empty string).
        if len(text) != 1 or text not in data:
            continue
        print('-----111111----')
        print(text)
        # Match every <a> whose text begins with the current letter.
        condition = '//a[starts-with(text(), "' + text + '")]'
        for one1 in web.find_elements(By.XPATH, condition):
            print(one1.text + '---')
finally:
    # Always shut the headless browser down, even when the crawl fails,
    # so no orphaned Chrome/chromedriver process is left behind.
    web.quit()
from selenium import webdriver
from selenium.webdriver.common.by import By
from idiom import DbHandle


def _classify_form(result):
    """Return the repetition-pattern code stored as chinese_idiom.form_type.

    Codes, checked in this order (the first matching rule wins):
      9 - the text is not exactly four characters long
      1 - 1st == 2nd            (2 when additionally 3rd == 4th)
      3 - 1st == 3rd            (4 when additionally 2nd == 4th)
      5 - 1st == 4th
      6 - 2nd == 3rd
      7 - 2nd == 4th
      8 - 3rd == 4th
      0 - four characters with none of the pairs above equal
    """
    if len(result) != 4:
        return 9
    first, second, third, fourth = result
    if first == second:
        return 2 if third == fourth else 1
    if first == third:
        return 4 if second == fourth else 3
    if first == fourth:
        return 5
    if second == third:
        return 6
    if second == fourth:
        return 7
    if third == fourth:
        return 8
    return 0


# For every character in chinese_characters, search cy.5156edu.com for it and
# insert the returned <u> texts that begin with that character into
# chinese_idiom, together with their form-type code.
option = webdriver.ChromeOptions()
option.add_argument('headless')
driver = webdriver.Chrome(options=option)
try:
    driver.get("http://cy.5156edu.com/")
    db = DbHandle.DbHandle()
    sql = 'SELECT id,word FROM chinese_characters'
    data = db.select_all(sql)
    for one in data:
        print('----' + str(one['id']) + '-----' + one['word'] + '------')
        # find_element(By...) replaces the find_element_by_* helpers, which
        # were removed in Selenium 4.3; it is also accepted by Selenium 3.
        search = driver.find_element(By.ID, '_SearchString')
        search.clear()
        search.send_keys(one['word'])
        # Pick the second <option> of the f_type2 drop-down before searching.
        select = driver.find_element(By.NAME, 'f_type2')
        select.find_elements(By.TAG_NAME, 'option')[1].click()
        driver.find_element(By.XPATH, '//input[@value="查词典"]').click()
        idiom = driver.find_elements(By.TAG_NAME, 'u')
        if not idiom:
            continue
        print('--------------')
        for two in idiom:
            result = two.text
            if not result:
                # An empty <u> would make result[0] raise IndexError and
                # abort the whole crawl; skip it instead.
                continue
            if result[0] != one['word']:
                # Stop at the first hit that does not start with the
                # searched character.
                break
            form = _classify_form(result)
            # NOTE(review): the statement is built from scraped text; switch
            # to parameter binding if DbHandle.update supports it -- its
            # signature is not visible here.
            sql = "INSERT INTO chinese_idiom (idiom,first_word_id,form_type) " \
                  "VALUES ('%s','%s','%s')" % (result, one['id'], form)
            print(sql)
            db.update(sql)
            print(result + '------------' + str(form))
finally:
    # Always shut the headless browser down, even when the crawl fails,
    # so no orphaned Chrome/chromedriver process is left behind.
    driver.quit()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值