"""Open a court news site in Chrome, click an article link, then scrape the article text."""

from time import sleep

import requests
from lxml import etree
from selenium import webdriver

# Landing page that is opened in the browser.
INDEX_URL = 'http://spbfy.chinacourt.gov.cn/index.shtml'

# Article page that is downloaded with a plain HTTP request and parsed.
ARTICLE_URL = 'http://spbfy.chinacourt.gov.cn/article/detail/2017/03/id/3201603.shtml'

# JavaScript executed in the page: clicks the link inside the sixth list item.
CLICK_ARTICLE_JS = 'document.querySelector("#layout > div.index_right > div:nth-child(2) > div.list_br > div.list.dian_a.font14 > ul > li:nth-child(6) > span > a").click();'

# XPath selecting every text node below the direct <div> children of #container.
ARTICLE_TEXT_XPATH = '//*[@id="container"]/div//text()'

# Upper bound (seconds) for the HTTP request so a stalled server cannot hang the script.
REQUEST_TIMEOUT_SECONDS = 10


def _open_article_in_browser():
    """Load the index page in Chrome, print its source and click the article link.

    The browser is always closed afterwards, even when a step raises, so no
    Chrome/driver process is left behind.
    """
    browser = webdriver.Chrome()
    try:
        # Maximize the window.
        browser.maximize_window()
        # Navigate to the index page.
        browser.get(INDEX_URL)
        sleep(1)  # give the page a moment to render before reading it
        # Dump the rendered page source.
        print(browser.page_source)
        print()
        # Follow the article link by clicking it from JavaScript.
        browser.execute_script(CLICK_ARTICLE_JS)
        sleep(5)  # leave the clicked page on screen for a few seconds
    finally:
        browser.quit()


def _fetch_article_fragments(url):
    """Download *url* and return the text nodes matched by ARTICLE_TEXT_XPATH.

    Raises:
        requests.RequestException: on timeout, connection failure or an HTTP
            error status (an error page must not be scraped as article text).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    document = etree.HTML(response.text)
    return document.xpath(ARTICLE_TEXT_XPATH)


def main():
    """Run the browser walkthrough, then print the cleaned article text."""
    _open_article_in_browser()

    # Scrape the article content.
    fragments = _fetch_article_fragments(ARTICLE_URL)
    print(len(fragments))
    for fragment in fragments:
        # Drop newlines and spaces; fragments that become empty are skipped.
        cleaned = fragment.replace('\n', '').replace(' ', '')
        if cleaned:
            print(cleaned)


if __name__ == '__main__':
    main()
Python 之 Selenium 连接浏览器自动爬取内容
最新推荐文章于 2023-05-17 15:52:11 发布