python 爬虫 爬取序列博客文章列表

python中写个爬虫真是太简单了




import urllib.request
from pyquery import PyQuery as PQ

# 根据URL获取内容并解码为UTF-8
# Fetch a URL and return its body decoded as UTF-8.
def getHtml(url):
    """Download *url* and return the response body as a UTF-8 string.

    Parameters
    ----------
    url : str
        Absolute HTTP URL to fetch.

    Returns
    -------
    str
        The decoded page content.

    Raises
    ------
    urllib.error.URLError
        If the request fails.
    UnicodeDecodeError
        If the body is not valid UTF-8.
    """
    # Use a context manager so the connection is closed deterministically
    # instead of leaking until garbage collection (original never closed it).
    with urllib.request.urlopen(url) as page:
        return page.read().decode('UTF-8')

# 解析返回的html
# Parse one article-list page and collect matching posts.
def getArtical(html, results):
    """Extract "教你炒股票" series posts from a list page into *results*.

    Parameters
    ----------
    html : str
        HTML of one article-list page.
    results : list
        Mutated in place; each entry is ``(sequence_number, markdown_link)``
        so the caller can sort by sequence number.
    """
    doc = PQ(html)
    # Each anchor under .atc_title is one article link on the list page.
    # (The commented selector below was for the old search-result layout.)
    # data = doc('.searchAtcList .searchAtc_top a')
    for anchor in doc('.atc_title a').items():
        title = anchor.text()
        href = anchor.attr('href')
        # Only posts in the target series are of interest.
        if title.find('教你炒股票') < 0:
            continue
        # Truncated titles end with an ellipsis; recover the full title
        # from the article page itself.
        if title.find('…') >= 0:
            title = getArticalDetail(href)
        # The sequence number sits between the 5-char series prefix and the
        # full-width colon, e.g. "教你炒股票12:...". Guard against titles
        # missing the colon instead of crashing on str.index (ValueError).
        colon = title.find(':')
        if colon < 0:
            continue
        index = title[5:colon]
        results.append((int(index), '[' + title + '](' + href + ')'))

# 获取文章标题
# Fetch an article page and return its full (untruncated) title.
def getArticalDetail(url):
    """Download the article at *url* and return its title text.

    Used when the list page shows a truncated title ending in an ellipsis.
    """
    document = PQ(getHtml(url))
    return document('.articalTitle h2').text()

# Base URL of the paginated article list; page URLs look like
# http://blog.sina.com.cn/s/articlelist_1215172700_0_1.html
# http://blog.sina.com.cn/s/articlelist_1215172700_0_15.html
blog3 = 'http://blog.sina.com.cn/s/articlelist_1215172700_0_'
# Earlier attempts via the blog search endpoint, kept for reference:
# blog = 'http://control.blog.sina.com.cn/search/search.php?uid=1215172700&keyword=%E8%82%A1%E7%A5%A8&page='
# blog2 = 'http://control.blog.sina.com.cn/search/search.php?uid=1215172700&keyword=%E8%82%A1%E7%A5%A8&page='


def main():
    """Crawl all list pages, then print the matching posts as sorted
    markdown links (sorted by the series sequence number)."""
    results = []
    # The blog has 23 list pages in total.
    for i in range(1, 24):
        url = blog3 + str(i) + '.html'
        print(url)
        getArtical(getHtml(url), results)
    # Tuples sort by their first element (the sequence number);
    # print only the markdown link part.
    results.sort()
    for _, link in results:
        print(link)


# Guard the crawl so importing this module does not trigger network I/O.
if __name__ == '__main__':
    main()


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值