# Code
"""
@author: MR.N
@created: 2022/9/20 Mon. 23:08
@file: se_sohu_1
@version: 1.0
"""
from httpkit import *
from scrapy import Selector
# import re
from filtertags import *
import time
import os
def test(url='https://www.sohu.com'):
    """Fetch *url* and collect the cleaned text of its title-bearing anchors.

    The page is downloaded through ``get_res_objects2`` and every ``<a>``
    element is serialized back to markup. Anchors whose markup contains
    ``title="`` are passed through ``filter_html_tags`` (presumably strips
    the HTML tags -- confirm against ``filtertags``), and spaces and
    newlines are then removed from the result.

    :param url: address of the page to scan.
    :return: ``None`` when the download does not report ``'success'``;
        otherwise a list with one cleaned entry per matching anchor
        (possibly empty, in which case the raw page is printed after
        ``'err'`` for debugging).
    """
    task = RemoteTask(url=url)
    fetched = []
    # NOTE(review): meaning of dtype=0 is defined by httpkit -- confirm.
    status = get_res_objects2(remote_task=task, ret=fetched, dtype=0)
    if status != 'success':
        return None

    page = fetched[0]
    # News headlines are held in <a> tags that carry a specific marker.
    anchors = Selector(text=page).xpath('//a').getall()
    # The <a> tags that hold a headline have a title attribute.
    sources = [
        filter_html_tags(anchor).replace(' ', '').replace('\n', '')
        for anchor in anchors
        if 'title="' in anchor
    ]
    if not sources:
        print('err', page)
    return sources
if __name__ == '__main__':
    # test() returns None when the fetch does not report 'success'; fall
    # back to an empty list so the loop below does not raise TypeError.
    items = test() or []
    for item in items:
        print('\n\n\t' + item)
        # Short pause after each printed headline.
        # NOTE(review): the original indentation was lost; the sleep is
        # assumed to belong inside the loop -- confirm.
        time.sleep(.6)
# Example