因为部分资料和代码参考自网络，所以本文标记为转载。
# coding=utf-8
import re
import time

from bs4 import BeautifulSoup  # requires the BeautifulSoup (bs4) package to be installed
from selenium import webdriver
# Requires the Chrome driver (chromedriver) to be installed.
# Download link from the original post: http://pan.baidu.com/s/1i5Kgvjf  password: 3xhd
# Location of the ChromeDriver executable. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences.
path = r"D:\chromedriver\chromedriver.exe"

# File the extracted entries are written to (raw string for the same reason;
# the value is identical to the original literal).
_OUTPUT_FILE = r'E:\内涵段子\201792.txt'

# Markup that separates lines inside one entry.
_BR_TAG = '<br />'

# Captures the inner HTML of every <div> whose class attribute starts with
# "upload-txt". re.S lets '.' span newlines so multi-line entries match.
# Compiled once here instead of on every getText() call.
_ENTRY_RE = re.compile(r'<div class="upload-txt.*?">(.*?)</div>', re.S)


def getText(html):
    """Return the inner HTML of every ``upload-txt`` <div> found in *html*.

    Args:
        html: Page source to search, as a string.

    Returns:
        A list of strings, one per matching <div>, in document order. Each
        string is the raw text between the opening tag and the first
        following ``</div>``; markup such as ``<br />`` is left untouched.
        The list is empty when nothing matches.
    """
    return _ENTRY_RE.findall(html)


# Guarded so that importing this module (for example to reuse getText) does
# not launch a browser; running the file as a script behaves as before.
if __name__ == "__main__":
    driver = webdriver.Chrome(executable_path=path)
    driver.get('https://neihanshequ.com/')

    # Click the "load more" button 100 times, pausing one second after each
    # click before clicking again.
    for i in range(100):
        driver.find_element_by_id("loadMore").click()
        time.sleep(1)

    html = driver.page_source  # full page text after all the clicks
    print(html)

    pag = getText(html)
    with open(_OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for each in pag:
            has_breaks = _BR_TAG in each
            text = each.replace(_BR_TAG, '\n')
            # Every entry is terminated with a newline. Previously entries
            # containing <br /> were written without one and ran straight
            # into the following entry.
            f.write(text + '\n')
            if has_breaks:
                # Only entries that contained <br /> are echoed to the
                # console, as in the original script.
                print(text)
    print('All done')
    # NOTE(review): the browser is left open on purpose here because the
    # original script had its quit call disabled; call driver.quit() at this
    # point if the window should be closed when the run finishes.