# -*- coding:utf-8 -*-
from requests_html import HTMLSession
import requests_html
import io
import sys
import urllib.request
import os,re
import pandas as pd
import numpy as np
# 去除文章标题里的特殊符合
def qufh(title):
    """Strip whitespace and ASCII/full-width (Chinese) punctuation from *title*.

    Used to turn an article title into a safe file name.
    Returns the cleaned string.
    """
    # Raw string avoids the deprecated invalid-escape behavior of the original
    # non-raw pattern ("\s" etc.); the regex itself is unchanged. The first
    # class matches ASCII punctuation/whitespace, the second full-width marks.
    return re.sub(
        r"[\s+\.\!\/\\_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()》《><|::|]+",
        "",
        title,
    )
def crawl(url):
    """Fetch one article page, extract title/intro/body, save them to a .txt file.

    url: address of the article page.
    Returns the saved file name on success, or None if the file write failed.
    """
    session = HTMLSession()
    r = session.get(url)
    introStr = ''
    titleStr = ''
    contentList = []
    # Locate page elements via CSS selectors
    for title in r.html.find('title'):
        titleStr = title.text
        # Drop the site suffix from the page title (i.e. remove " - 春雨医生")
        titleStr = titleStr.split(" - 春雨医生")[0]
        print('titleStr---:', titleStr)
    for intro in r.html.find('div.news-intro'):
        introStr = intro.text
    for content in r.html.find('div.news-content'):
        contentList.append(content.text)
    contentStr = "".join(contentList)
    try:
        name = qufh(titleStr) + ".txt"  # file name derived from the title
        print(name)
        # Context manager guarantees the handle is closed even if write() raises
        with open(name, 'w', encoding="utf-8") as f:
            f.write(titleStr + '\n' + introStr + '\n' + contentStr)
        return name
    except OSError:
        # Was a bare "except:"; narrowed to file-system errors so real bugs
        # (e.g. NameError) are no longer silently swallowed.
        print(url)
        print("文件命名错误或文件操作有错")
def main(data, src, dirname):
    """Crawl every URL in *data* and record "url,filename" lines in an index CSV.

    data:    DataFrame/array whose first column holds article URLs.
    src:     base directory (must end with a path separator, e.g. 'E:\\').
    dirname: subdirectory created under *src* to hold the downloaded articles;
             also names the index CSV written inside it.
    """
    dataList = np.array(data).tolist()
    os.chdir(src)  # switch to the base directory, e.g. E:\limeng
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    os.chdir(src + dirname)  # switch into the article directory, e.g. E:\limeng\0-10
    file_name = dirname + ".csv"
    # "with" ensures the index file is closed/flushed even if crawling fails
    # (the original handle was never closed).
    with open(file_name, 'w', encoding="utf-8") as write_file_name:
        for i in dataList:
            try:
                # Record the article URL and the file name crawl() saved it under
                write_file_name.write(i[0] + "," + crawl(i[0]) + '\n')
            except Exception as err:
                # crawl() returns None on failure, which makes the concatenation
                # raise TypeError; log it and continue with the next URL.
                print(err)
# Main program --------------------------------------------------------------
if __name__ == "__main__":
    # First column of testurl.csv holds the URLs; whitespace-delimited, no header.
    data = pd.read_csv("testurl.csv", header=None, delim_whitespace=True)
    # Download every article into E:\article1 and build the index CSV there.
    main(data, 'E:\\', 'article1')
# 网站文本爬虫 (website text crawler)
# 最新推荐文章于 2022-01-14 09:54:49 发布