from requests_plus import get
from requests_plus import pq, getPq
# Identifiers used to build the CSDN category page URL.
uid = '014534808'
# Active category: '11432885' (web-crawler column).
# Alternative category: '10711684' ("Python: From Beginner to Mastery").
cid = '11432885'
url = 'https://blog.csdn.net/u{}/category_{}.html'.format(uid, cid)

# Extraction spec handed to getPq: each key maps to a list holding a CSS
# selector and, optionally, a callable applied to the selection.
# NOTE(review): how getPq interprets these entries is defined in
# requests_plus and is not visible here -- confirm against that module.
selectors = {
    # Text of the column owner's title element.
    'person': ['.column_person_tit', lambda node: node.text()],
    # Text of '.mumber-color' inside the first '.column_data span'
    # (the selector spelling 'mumber' is kept as written).
    'aCount': [
        '.column_data span',
        lambda spans: spans.eq(0).find('.mumber-color').text(),
    ],
    # Article links, with no post-processing callable. A disabled variant
    # of this entry tested whether each link's text contained the title of
    # the alternative category.
    'aItems': ['.column_article_list a'],
}

# getPq returns three values; only `t` and `items` are used below.
# NOTE(review): presumably `r` is the HTTP response and `t` the page
# title -- verify against requests_plus.
r, t, items = getPq(url, selectors)

# Print the summary line: t, the label, the article count, the owner text.
article_count = items.get('aCount')
person = items.get('person')
print(t, '文章数', article_count, person)
# Decorative characters removed from article titles: U+2764 (heavy black
# heart) and U+FE0F (variation selector-16, UTF-8 bytes EF B8 8F) that follows
# it in the emoji presentation form. Built once, outside the loop.
_TITLE_NOISE = str.maketrans('', '', '\u2764\ufe0f')

# Print "<href> <title>" for every article link extracted above.
# A missing 'aItems' entry is treated as "no articles" rather than raising
# a TypeError on iterating None.
# NOTE(review): each entry is assumed to expose attr()/find() as used
# below -- confirm what getPq places under 'aItems'.
for link in items.get('aItems') or ():
    href = link.attr('href')
    heading = link.find('h2')
    # Drop nested <span> elements so only the bare title text remains.
    heading.find('span').remove()
    # Remove the decorative characters BEFORE stripping: U+FE0F is not
    # whitespace, so stripping first would leave a stray space where a
    # leading or trailing heart emoji used to be.
    title = heading.text().translate(_TITLE_NOISE).strip()
    print(href, title)
# NOTE: requests_plus is currently being refactored into a class, so its source is not shown here.