Expanding "展开阅读全文" (read full text) links: HTML, JS, and Selenium crawler notes

First attempt: load the page, scroll down in small steps, and click every '展开阅读全文 ∨' ("expand full text") link until all of them have been expanded, then save the page source to a temp file for BeautifulSoup.

from selenium import webdriver
import time
import random
from bs4 import *

browser = webdriver.Chrome()

url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# Count the "expand" links before clicking anything.
# (find_elements_by_link_text is the Selenium 3 API; Selenium 4 removed it.)
ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))

ck_l_ori_ok = 0
try:
    for isc in range(100):
        if ck_l_ori_ok == ck_l_ori_len:
            break
        time.sleep(1)
        # js = 'window.scrollTo(0,document.body.scrollHeight)'  # one-shot scroll; overridden by the stepwise scroll below
        js = 'window.scrollTo(0,100*{})'.format(isc)
        browser.execute_script(js)
        ck_l = browser.find_elements_by_link_text('展开阅读全文 ∨')
        for i in ck_l:
            try:
                i.click()
                ck_l_ori_ok += 1
            except Exception as e:
                print(e)
except Exception as e:
    print('window.scrollTo-->', e)

xp_l = ['//*[@id="fanyi967"]/div/div[3]/a', ]

myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)

sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES'
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')

dd = 9  # debug marker
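The draft stops right after building the soup: `sql` holds only the head of an INSERT and `bs` is never queried. A minimal sketch of how the two might be joined up, continuing from the names above (`bs`, `url`); the SQLite file and the choice of what counts as "children_url" are hypothetical, not from the original post.

import sqlite3

page_title = bs.title.string if bs.title else ''
# Illustrative choice: treat every link on the page as a child URL.
children = ','.join(a.get('href', '') for a in bs.find_all('a'))

conn = sqlite3.connect('gushiwen.db')  # hypothetical database file
conn.execute(
    'INSERT INTO parent_url (page_title,page_url,children_url) VALUES (?,?,?)',
    (page_title, url, children),
)
conn.commit()
conn.close()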

The next idea: expand the links from injected JavaScript instead of clicking them one by one. Everything after the `<` in `i<le` was eaten by HTML escaping when the post was published; a plausible completion that clicks each matching anchor:

a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].innerText=='展开阅读全文 ∨'){a_[i].click();}}
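A JavaScript click also sidesteps the "element not interactable" errors that element.click() raises for links outside the viewport. Selenium can pass a found element straight into a script, which gives the same effect without string-matching in JS (a sketch, not from the original post):

for el in browser.find_elements_by_link_text('展开阅读全文 ∨'):
    # arguments[0] is the WebElement passed in; a JS click works even off-screen.
    browser.execute_script('arguments[0].click();', el)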

Second version: run the injected-JS click first, then keep the scroll-and-click loop as a mop-up for anything the script missed (the first attempt was left commented out in the original and is omitted here).

from selenium import webdriver
import time
import random
from bs4 import *

browser = webdriver.Chrome()

url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# Same snippet as above; the tail after 'for(i=0;i' is reconstructed.
js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].innerText=='展开阅读全文 ∨'){a_[i].click();}}"
try:
    browser.execute_script(js)
except Exception as e:
    print(e)

ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))
ck_l_ori_ok = 0
try:
    for isc in range(100):
        if ck_l_ori_ok == ck_l_ori_len:
            break
        time.sleep(1)
        # js = 'window.scrollTo(0,document.body.scrollHeight)'  # overridden by the stepwise scroll below
        js = 'window.scrollTo(0,100*{})'.format(isc)
        browser.execute_script(js)
        ck_l = browser.find_elements_by_link_text('展开阅读全文 ∨')
        for i in ck_l:
            try:
                i.click()
                ck_l_ori_ok += 1
            except Exception as e:
                print(e)
except Exception as e:
    print('window.scrollTo-->', e)

Final version, with pyquery added for extraction once everything is expanded:

from selenium import webdriver
import time
import random
from bs4 import *  # unused here; kept from the earlier drafts
from pyquery import PyQuery as pq

browser = webdriver.Chrome()

url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# Expand all '展开阅读全文 ∨' links via injected JS (tail reconstructed, as above).
js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].innerText=='展开阅读全文 ∨'){a_[i].click();}}"
try:
    browser.execute_script(js)
except Exception as e:
    print(e)

ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))
ck_l_ori_ok = 0
try:
    for isc in range(100):
        if ck_l_ori_ok == ck_l_ori_len:
            break
        time.sleep(1)
        # js = 'window.scrollTo(0,document.body.scrollHeight)'  # dead assignment in the original; the line below overrides it
        js = 'window.scrollTo(0,100*{})'.format(isc)
        browser.execute_script(js)
        ck_l = browser.find_elements_by_link_text('展开阅读全文 ∨')
        for i in ck_l:
            try:
                i.click()
                ck_l_ori_ok += 1
            except Exception as e:
                print(e)
except Exception as e:
    print('window.scrollTo-->', e)
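find_elements_by_link_text was removed in Selenium 4, so the loop above only runs on Selenium 3. A sketch of the same expand-all step against the Selenium 4 API, with an explicit wait for page load instead of the fixed sleeps (otherwise the same names and URL):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

browser = webdriver.Chrome()
browser.get('https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx')
# Wait until the document has finished loading before looking for links.
WebDriverWait(browser, 10).until(
    lambda d: d.execute_script('return document.readyState') == 'complete')

for el in browser.find_elements(By.LINK_TEXT, '展开阅读全文 ∨'):
    browser.execute_script('arguments[0].click();', el)  # JS click works off-screen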

# pyquery (lxml) keeps the XHTML namespace on parsed nodes, so strip the
# xmlns attribute that .html() would otherwise leak into every fragment.
doc = pq(browser.page_source)
pq_r_d = {'xmlns="http://www.w3.org/1999/xhtml"': ''}  # unused; superseded by the pair below
r_k, r_v = 'xmlns="http://www.w3.org/1999/xhtml"', ''

article_ = doc('.left>:nth-child(2).sons>.cont>.contson').html().replace(r_k, r_v)
title_d = {'h1': doc('.left>:nth-child(2).sons>.cont>:nth-child(2)').html().replace(r_k, r_v)}
author_d = {'h3': doc('.left>:nth-child(2).sons>.cont>:nth-child(3)').text()}
translation_ = doc('.left>:nth-child(4)>.contyishang>:nth-child(2)').html().replace(r_k, r_v)
explanation_ = doc('.left>:nth-child(4)>.contyishang>:nth-child(3)').html().replace(r_k, r_v)
refer_ = doc('.left>:nth-child(4)>.cankao').html().replace(r_k, r_v)
# Pull the author portrait URL out of the raw <img> markup.
author_img_url = doc('.left>.sonspic>.cont>.divimg>:nth-child(1)').html().split('src="')[-1].split('"')[0]

d = 4  # debug marker
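To make the run reproducible, the extracted pieces can be dumped as JSON; a short sketch continuing from the variables above (the output filename is hypothetical, not from the original post):

import json

record = {
    'url': url,
    'title': title_d,
    'author': author_d,
    'article': article_,
    'translation': translation_,
    'explanation': explanation_,
    'reference': refer_,
    'author_img_url': author_img_url,
}
with open('gushiwen_record.json', 'w', encoding='utf-8') as fw:  # hypothetical output path
    json.dump(record, fw, ensure_ascii=False, indent=2)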
