参考教程,练习BeautifulSoup实例

<pre name="code" class="python">#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import re
import threading
import queue

# Base URL of the site; relative links scraped from the index page are appended to it.
root_url = 'http://pyvideo.org'
# Listing page of the PyCon US 2014 category, used as the starting point of the crawl.
index_url = root_url + '/category/50/pycon-us-2014'
# Shared work queue: main() puts video page URLs here and Mythread workers take them off.
q = queue.Queue()
# Shared output list: each worker appends the dict returned by get_video_msg().
result = []


def get_video_page_urls(timeout=30):
    """Return the relative URLs of all video pages linked from the index page.

    The index page (``index_url``) is downloaded and every ``<a>`` element
    with the CSS class ``thumbnail`` that carries an ``href`` attribute
    contributes its ``href`` value to the result.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A list of ``href`` strings, in document order.

    Raises:
        requests.RequestException: On connection problems, a timeout, or an
            HTTP error status (raised by ``raise_for_status``).
    """
    response = requests.get(index_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # href=True skips anchors without an href, which would otherwise yield
    # None and break the later ``root_url + video_url`` concatenation.
    return [a['href'] for a in soup('a', class_='thumbnail', href=True)]


def get_video_msg(video_url, timeout=30):
    """Download one video page and extract its metadata from the sidebar.

    Args:
        video_url: Path of the video page relative to ``root_url``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A dict with the keys ``'Category'``, ``'Speakers'``, ``'Language'``,
        ``'Recorded'`` and ``'Video origin'``. ``'Speakers'`` and
        ``'Video origin'`` are ``'Unknown'`` when the page has no matching
        element.

    Raises:
        requests.RequestException: On connection problems, a timeout, or an
            HTTP error status (raised by ``raise_for_status``).
        AttributeError: If the page has no element with id ``sidebar``, or
            the sidebar lacks the category link or the ``inLanguage`` /
            ``dateCreated`` meta elements.
    """
    response = requests.get(root_url + video_url, timeout=timeout)
    # Surface HTTP errors directly rather than as a confusing parse failure.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    tag = soup.find(id='sidebar')

    video_data = {}
    video_data['Category'] = tag.find('a', href=re.compile('category')).string

    # Optional field: an explicit None check replaces the former bare
    # ``except:``, which also swallowed KeyboardInterrupt and SystemExit.
    author = tag.find('meta', property='author')
    video_data['Speakers'] = 'Unknown' if author is None else author.get('content')

    # The displayed value is the text node immediately preceding the meta element.
    video_data['Language'] = tag.find('meta', property='inLanguage').previous_element.strip()
    video_data['Recorded'] = tag.find('meta', property='dateCreated').previous_element.strip()

    # Optional field, handled the same way as 'Speakers'.
    origin = tag.find('a', property='embedUrl')
    video_data['Video origin'] = 'Unknown' if origin is None else origin.get('href')
    return video_data


def show_video_stats():
    """Print the metadata dict of every video on the index page, one after another."""
    for link in get_video_page_urls():
        info = get_video_msg(link)
        print(info)


class Mythread(threading.Thread):
    """Worker thread that drains the shared URL queue ``q``.

    Each worker repeatedly takes a video URL from ``q``, fetches its metadata
    with ``get_video_msg`` and appends the resulting dict to the shared
    ``result`` list. The thread exits once the queue is empty.
    """

    def __init__(self, name):
        """Create a worker whose thread name is ``str(name)``."""
        # Passing the name as a string matches what the ``name`` setter stores.
        super().__init__(name=str(name))

    def run(self):
        """Process queued URLs until none are left."""
        while True:
            try:
                # Non-blocking get: testing q.empty() and then calling a
                # blocking q.get() lets another worker take the last item in
                # between, leaving this thread blocked forever.
                url = q.get_nowait()
            except queue.Empty:
                break
            print('thread:', self.name)
            try:
                result.append(get_video_msg(url))
            except Exception as exc:
                # One bad page must not kill the worker; report it and go on.
                print('thread:', self.name, 'failed on', url, '-', repr(exc))
            finally:
                # Always acknowledge the item, otherwise q.join() never returns.
                q.task_done()


# get_video_msg('/video/2668/writing-restful-web-services-with-flask')
# show_video_stats()

def main(num_threads=8):
    """Crawl all video pages with a pool of worker threads and print the results.

    The video URLs from the index page are put on the shared queue ``q``,
    ``num_threads`` ``Mythread`` workers are started to process them, and once
    the queue has been fully processed every collected entry in ``result`` is
    printed.

    Args:
        num_threads: Number of worker threads to start (default 8).

    Raises:
        ValueError: If ``num_threads`` is less than 1; with no workers the
            queue would never be drained and ``q.join()`` would block forever.
    """
    if num_threads < 1:
        raise ValueError('num_threads must be at least 1')
    for video_url in get_video_page_urls():
        q.put(video_url)
    for i in range(num_threads):
        Mythread(i).start()
    # Blocks until task_done() has been called once for every queued URL.
    q.join()
    for msg in result:
        print(msg)


# Start the crawl. There is no ``if __name__ == '__main__'`` guard, so this also runs on import.
main()



                
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值