Series directory
Features
A web crawler that collects news content on a schedule: configure the interval and it crawls news automatically. The collection source is Sina News.
I. Structure
The news crawler consists of three parts: a URL collector, a detail page collector, and a scheduler.
| Collector | Function |
| --- | --- |
| URL collector | Collects the detail-page URLs from Sina's rolling-news feed |
| Detail page collector | Scrapes the content of each detail page (i.e., the news article) from the collected URLs |
| Scheduled collector | Starts and stops the collectors and configures the timed jobs |
II. Implementation
1. URL Collector
```python
import datetime
import re

import requests

# OperationMysql and logger are helpers defined elsewhere in the project

'''
Collects news through the Sina rolling-news API.
Parameter analysis:
pageid  appears to be a fixed parameter; the default value is 153
lid     category ID: 2509 (All), 2510 (Domestic), 2511 (International), 2669 (Society),
        2512 (Sports), 2513 (Entertainment), 2514 (Military), 2515 (Technology),
        2516 (Finance), 2517 (Stock Market), 2518 (US Stocks)
num     number of news items to fetch; the upper limit is 50
'''
def urlcollect(lid):
    op_mysql = OperationMysql()  # create the database connection object
    url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=' + str(lid) + '&num=50'  # Sina news API
    result = requests.get(url)  # request the API
    result.encoding = 'utf-8'  # the API responds in ISO encoding, which garbles Chinese, so switch to UTF-8
    # print('Web:', result.text)
    urls = re.findall(r'"url":"(.*?)"', result.text)  # extract every detail-page URL from the API response
    # times = re.findall(r'"ctime":"(.*?)"', result.text)
    # strip the backslash escapes from each URL so it becomes a plain, unescaped string,
    # then store the cleaned URLs in the database
    changedict = {"2518": 0, "2510": 1, "2511": 2, "2669": 3, "2512": 4, "2513": 5, "2514": 6, "2515": 7, "2516": 8, "2517": 9}
    Type = changedict.get(str(lid))
    for numbers in range(len(urls)):
        urls[numbers] = urls[numbers].replace('\\', '')
        logger.info("url:{}".format(urls[numbers]))
        time = datetime.datetime.now().strftime('%Y-%m-%d')
        sql_i = "INSERT INTO news_api_urlcollect(url, type, time) values ('%s', %d, '%s')" % (urls[numbers], Type, time)
        op_mysql.insert_one(sql_i)
    op_mysql.conn.close()
```
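Incidentally, the roll API actually returns JSON, so the regex over the raw text can be avoided. A minimal sketch of the same collection step via JSON parsing, assuming the response keeps its current shape with the article list under `result.data` (the helper name `fetch_roll_urls` is mine):

```python
import requests

def fetch_roll_urls(lid, num=50):
    """Fetch detail-page URLs from the Sina roll API by parsing its JSON body."""
    api = ('https://feed.mix.sina.com.cn/api/roll/get'
           '?pageid=153&lid={}&num={}'.format(lid, num))
    resp = requests.get(api, timeout=10)
    resp.encoding = 'utf-8'
    payload = resp.json()  # assumption: JSON body with a result.data list of articles
    items = payload.get('result', {}).get('data', [])
    # json.loads already unescapes the URLs, so no backslash cleanup is needed
    return [item['url'] for item in items if 'url' in item]
```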
2. Detail Page Collector
```python
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def getnewsdetail(url):
    # scrape the detail content from the page and gather it into the news dict
    result = requests.get(url)
    result.encoding = 'utf-8'
    soup = BeautifulSoup(result.content, features="html.parser")
    title = getnewstitle(soup)
    if title is None:
        return None
    date = getnewsdate(soup)
    mainpage, orimainpage = getmainpage(soup)
    if mainpage is None:
        return None
    pic_url = getnewspic_url(soup)
    videourl = getvideourl(url)
    news = {'mainpage': mainpage,
            'pic_url': pic_url,
            'title': title,
            'date': date,
            'videourl': videourl,
            'origin': orimainpage,
            }
    return news
```
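A quick manual check of the collector might look like this (the URL is a hypothetical placeholder; substitute one produced by `urlcollect`):

```python
if __name__ == '__main__':
    # hypothetical detail-page URL; use one collected into news_api_urlcollect
    news = getnewsdetail('https://news.sina.com.cn/c/2021-01-01/doc-example.shtml')
    if news is not None:
        print(news['title'], news['date'])
        print(news['mainpage'][:100])  # first 100 characters of the body text
```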
```python
def getmainpage(soup):
    '''
    @Description: extract the <p> tags that make up the article body; Sina marks body
                  paragraphs with a leading full-width space (\u3000)
    @:param None
    '''
    if soup.find('div', id='article') is not None:
        soup = soup.find('div', id='article')
    elif soup.find('div', id='artibody') is not None:
        soup = soup.find('div', id='artibody')
    else:
        return None, None
    p = soup.find_all('p')
    for numbers in range(len(p)):
        # strip full-width and non-breaking spaces, and scrub the brand name
        p[numbers] = p[numbers].get_text().replace("\u3000", "").replace("\xa0", "").replace("新浪", "新闻")
    text_all = ""
    for each in p:
        text_all += each
    logger.info("mainpage:{}".format(text_all))
    return text_all, p
```
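To see what `getmainpage` produces, here is a self-contained check against a hand-written fragment (not a real Sina page):

```python
from bs4 import BeautifulSoup

html = ('<div id="article">'
        '<p>\u3000\u3000First paragraph of the body.</p>'
        '<p>Second paragraph.</p>'
        '</div>')
text, origin = getmainpage(BeautifulSoup(html, features="html.parser"))
print(text)    # -> First paragraph of the body.Second paragraph.
print(origin)  # -> ['First paragraph of the body.', 'Second paragraph.']
```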
```python
def getnewspic_url(soup):
    '''
    @Description: extract the body images; Sina wraps body images in a div whose
                  class attribute is "img_wrapper"
    @:param None
    '''
    pic = soup.find_all('div', class_='img_wrapper')
    pic_url = re.findall('src="(.*?)"', str(pic))
    for numbers in range(len(pic_url)):
        # the src attributes are protocol-relative ("//..."), so prepend a scheme
        if pic_url[numbers].startswith("//"):
            pic_url[numbers] = "https:" + pic_url[numbers]
        logger.info("pic_url:{}".format(pic_url[numbers]))
    return pic_url
```
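The `src` attributes on these pages begin with `//` (protocol-relative), which is why a scheme must be prepended before the links are usable:

```python
src = '//n.sinaimg.cn/news/sample.jpg'  # hypothetical protocol-relative link
full = 'https:' + src if src.startswith('//') else src
print(full)  # -> https://n.sinaimg.cn/news/sample.jpg
```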
```python
def getnewsdate(soup):
    '''
    @Description: extract the publication time; Sina puts it in a span whose class
                  attribute is "date" (or, on older pages, a span with id "pub_date")
    @:param None
    '''
    if soup.find('span', class_='date') is not None:
        date = str(soup.find('span', class_='date').text)
    elif soup.find('span', id="pub_date") is not None:
        date = str(soup.find('span', id="pub_date").text)
    else:
        date = ""  # neither marker found; fall back to an empty date
    logger.info("date:{}".format(date))
    return date

def getnewstitle(soup):
    '''
    @Description: extract the news title; Sina puts it in an h1 whose class attribute
                  is "main-title" (or, on older pages, an h1 with id "artibodyTitle")
    @:param None
    '''
    if soup.find('h1', class_='main-title') is not None:
        title = soup.find('h1', class_='main-title').text
    elif soup.find('h1', id='artibodyTitle') is not None:
        title = soup.find('h1', id='artibodyTitle').text
    else:
        return None
    logger.info("title:{}".format(title))
    return title
```
```python
def getvideourl(url):
    '''
    @Description: extract the news video; headless Chrome (webdriver.Chrome) is used to
                  fully render the page so the video's src link becomes available
    @:param None
    '''
    video_url = []
    try:
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(executable_path=r'C:\Program Files\Google\Chrome\Application\chromedriver.exe',
                                  options=chrome_options)
        driver.get(url)
        regex1 = re.compile('playsinline="playsinline" src="(.*?)"')
        video_url = regex1.findall(driver.page_source)
        driver.quit()  # release the browser instance
        for numbers in range(len(video_url)):
            video_url[numbers] = video_url[numbers].replace("amp;", "")  # undo HTML entity escaping (&amp;)
    except Exception:
        video_url = []
    return video_url
```
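Note that Selenium 4 removed the `executable_path` argument in favor of a `Service` object; if the environment runs a recent Selenium, the driver would be created roughly like this (same local chromedriver path as above):

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
service = Service(r'C:\Program Files\Google\Chrome\Application\chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)
```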
```python
def getdatabaseurl():
    '''
    @Description: fetch every URL in the database whose detail content has not yet been scraped
    @:param None
    '''
    op_mysql = OperationMysql()
    searchresult = op_mysql.search_all('select url, type from news_api_urlcollect where handle=0')
    op_mysql.conn.close()
    if len(searchresult) == 0:
        logger.warning("No such url to get detail")
        return None
    else:
        logger.info("Got All Url")
        return searchresult
```
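`OperationMysql` is a small helper class from this project whose implementation is not shown in this post. For reference, a minimal sketch of what such a wrapper could look like, assuming pymysql and placeholder connection parameters (the `DictCursor` matters: the code above indexes rows as `url['url']`):

```python
import pymysql

class OperationMysql:
    """Minimal sketch of the project's database helper (assumed implementation)."""
    def __init__(self):
        # placeholder credentials; replace with the real configuration
        self.conn = pymysql.connect(host='127.0.0.1', user='root', password='secret',
                                    db='news', charset='utf8mb4',
                                    cursorclass=pymysql.cursors.DictCursor)

    def search_all(self, sql):
        with self.conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()

    def insert_one(self, sql):
        with self.conn.cursor() as cur:
            cur.execute(sql)
        self.conn.commit()

    def update_one(self, sql):
        self.insert_one(sql)

    def delete_one(self, sql):
        self.insert_one(sql)
```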
```python
def insertdatabase(news, geturl, Type):
    '''
    @Description: store the scraped detail content in the database
    @:param None
    '''
    op_mysql = OperationMysql()
    url = geturl['url']
    title = str(news['title'])
    date = str(news['date'])
    pic_url = str(news['pic_url'])
    videourl = str(news['videourl'])
    mainpage = str(news['mainpage'])
    orimainpage = str(news['origin'])
    sql = 'insert into news_api_newsdetail(url, title, date, pic_url, videourl, mainpage, category, readnum, comments, origin) values ("%s", "%s", "%s", "%s", "%s", "%s", %d, 0, 0, "%s")' % (
        url, title, date, pic_url, videourl, mainpage, Type, orimainpage)
    try:
        op_mysql.insert_one(sql)
        # mark the URL as handled so it is not scraped again
        sql = 'update news_api_urlcollect set handle=1 where url="' + url + '"'
        op_mysql = OperationMysql()
        op_mysql.update_one(sql)
    except Exception:
        logger.error('Failed to insert the news detail')
```
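Building SQL by string interpolation is fragile (a quote character in a title breaks the statement) and open to injection. If the underlying connection is exposed, the insert can be parameterized so the driver escapes every value; a sketch against a pymysql-style connection:

```python
sql = ('insert into news_api_newsdetail(url, title, date, pic_url, videourl, '
       'mainpage, category, readnum, comments, origin) '
       'values (%s, %s, %s, %s, %s, %s, %s, 0, 0, %s)')
with op_mysql.conn.cursor() as cur:
    # the driver quotes and escapes each value, so titles with quotes are safe
    cur.execute(sql, (url, title, date, pic_url, videourl, mainpage, Type, orimainpage))
op_mysql.conn.commit()
```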
```python
def deleteurl(url):
    '''
    @Description: delete an erroneous (invalid) URL from the database
    @:param None
    '''
    op_mysql = OperationMysql()
    sql = 'delete from news_api_urlcollect where url="' + url['url'] + '"'
    op_mysql.delete_one(sql)

def insertalldetial():
    '''
    @Description: iterate over every collected URL and scrape its detail page
    @:param None
    '''
    logger.info("Begin Collect News_Url")
    urllist = getdatabaseurl()
    if urllist is not None:
        for url in urllist:
            logger.info("Begin to handle url: %s" % url['url'])
            news = getnewsdetail(url['url'])
            Type = url['type']
            if news is None:
                # the page could not be parsed, so drop the URL
                deleteurl(url)
            else:
                try:
                    insertdatabase(news=news, geturl=url, Type=Type)
                except Exception:
                    logger.error("Insert News_url Error!!")
                    return None
```
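The scheduler in the next section only registers `urlcollect`; `insertalldetial` would presumably be driven the same way, e.g.:

```python
# hypothetical wiring: run the detail collector every 5 minutes on the same scheduler
sched.add_job(insertalldetial, 'interval', max_instances=1, seconds=300, id='detailcollect')
```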
3. Scheduled Collector
```python
import os

from apscheduler.schedulers.blocking import BlockingScheduler

# create an APScheduler object (used to configure the scheduled jobs)
sched = BlockingScheduler()

def begincollect(time):
    time = int(time)
    try:
        # the 'interval' trigger runs a job at a fixed interval; add_job() registers one job per channel
        lids = ["2510", "2511", "2669", "2512", "2513", "2514", "2515", "2516", "2517", "2518"]
        for i, lid in enumerate(lids, start=1):
            sched.add_job(urlcollect, 'interval', max_instances=1, seconds=time,
                          id='urlcollect{}'.format(i), kwargs={"lid": lid})
        # to be able to stop the scheduled jobs later, save this process's PID to a file
        # when the jobs start; ClossScheduler.py reads it to kill the process
        pid = os.getpid()
        with open(file='urlSpider.txt', mode='w') as f1:
            f1.write(str(pid))
        sched.start()
    except Exception as e:
        logger.error('error: {}'.format(e))

def endsched():
    sched.shutdown()
```
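For example, collecting every channel at a ten-minute interval (`sched.start()` blocks, since this is a `BlockingScheduler`):

```python
if __name__ == '__main__':
    begincollect(600)  # register all channel jobs at a 600-second interval and block
```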
Summary
The system can reliably scrape every article from Sina's rolling-news feed. It is currently single-threaded, so throughput is modest. One unresolved issue remains in the scheduling: once APScheduler has started a collection job, the job cannot be stopped on demand; it must run its current cycle to completion before it stops. In short, the scheduler only controls the interval and the number of runs, not immediate start/stop.
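For reference, APScheduler can at least cancel all future runs immediately, even though it cannot interrupt a job that is already executing (which is why this project falls back to killing the process by its saved PID):

```python
sched.remove_job('urlcollect1')  # cancel future runs of a single job
sched.shutdown(wait=False)       # stop scheduling without waiting for running jobs
```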
In addition, the collector is not general-purpose: it can only scrape the specific pages it was written against. Generic content extraction, driven by analyzing the page structure, is a natural direction for extension.
The full project source code has been updated; feel free to download it 😀
Issues and bug reports are welcome.
My Gitee profile, feel free to get in touch!
My GitHub profile, feel free to get in touch!