用python3.5抓取内涵段子并且保存到本地_python 把网页抓到本地-CSDN博客

因为部分资料和代码参考自网络，所以本文标记为转载。

#coding=utf-8
import time
import re  
from bs4 import BeautifulSoup	#安装BeautifulSoup
from selenium import webdriver  #安装谷歌浏览器驱动 链接：http://pan.baidu.com/s/1i5Kgvjf 密码：3xhd

# Script section: open the site in Chrome, click "load more" repeatedly, and
# capture the resulting page markup in ``html``.

# Imported here because only the load-more loop below needs it.
from selenium.common.exceptions import WebDriverException

# Raw string so the backslashes in the Windows path are never interpreted as
# escape sequences ("\c" is an invalid escape and triggers a warning).
path = r"D:\chromedriver\chromedriver.exe"
driver = webdriver.Chrome(executable_path=path)
driver.get('https://neihanshequ.com/')
for _ in range(100):
    try:
        driver.find_element_by_id("loadMore").click()  # click "load more"
    except WebDriverException:
        # Presumably the button is gone or no longer clickable (e.g. nothing
        # left to load). Stop clicking and keep what has been loaded so far
        # instead of aborting the whole script.
        break
    time.sleep(1)  # pause between clicks

html = driver.page_source  # markup of the page as currently loaded
print(html)
def getText(html):
    """Extract the inner markup of every ``upload-txt`` div found in *html*.

    Args:
        html: Page markup as a string.

    Returns:
        A list of strings, one per ``<div class="upload-txt...">`` element,
        in the order they occur. Each string is the raw text between the
        opening tag and the first ``</div>`` that follows it; embedded tags
        such as ``<br />`` are left untouched.
    """
    # DOTALL lets "." cross newlines, so entries spanning lines still match.
    matcher = re.compile(r'<div class="upload-txt.*?">(.*?)</div>', re.DOTALL)
    return matcher.findall(html)
# Script section: extract the entries from the captured page and save them,
# one entry per block of lines, to a UTF-8 text file.
pag = getText(html)

# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences. The resulting path is identical to the original literal.
with open(r'E:\内涵段子\201792.txt', 'w', encoding='utf-8') as f:
    for each in pag:
        # Turn HTML line breaks into real newlines; this is a no-op for
        # entries that contain no "<br />".
        text = each.replace('<br />', '\n')
        # Always terminate the entry with a newline so consecutive entries
        # never run together in the output file.
        f.write(text + '\n')
        # Echo every entry to the console, not only the multi-line ones.
        print(text)
print('All done')
# driver.quit()  # left disabled as in the original; enable to close the browser