#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re
import threading
import queue
# Site that hosts the video pages; hrefs scraped from the index are joined onto it.
root_url = 'http://pyvideo.org'
# Category listing page whose thumbnails link to the individual video pages.
index_url = f'{root_url}/category/50/pycon-us-2014'
# Metadata dicts collected by the worker threads.
result = []
# Work queue of video-page hrefs shared by the worker threads.
q = queue.Queue()
def get_video_page_urls(timeout=30):
    """Return the href of every thumbnail link on the index page.

    Downloads ``index_url`` and collects the ``href`` attribute of each
    ``<a class="thumbnail">`` element that actually has one.

    Args:
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        A list of href strings, in document order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(index_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # href=True skips anchors without an href, which would otherwise yield
    # None and break the later ``root_url + video_url`` concatenation.
    return [a.get('href') for a in soup('a', class_='thumbnail', href=True)]
def get_video_msg(video_url, timeout=30):
    """Fetch one video page and extract its sidebar metadata.

    Args:
        video_url: Href of the video page; it is appended to ``root_url``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        A dict with the keys ``'Category'``, ``'Speakers'``, ``'Language'``,
        ``'Recorded'`` and ``'Video origin'``. ``'Speakers'`` and
        ``'Video origin'`` fall back to ``'Unknown'`` when the page has no
        matching element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        AttributeError: If the page has no ``id="sidebar"`` element or lacks
            the category / language / date markup.
    """
    response = requests.get(root_url + video_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    tag = soup.find(id='sidebar')

    video_data = {}
    video_data['Category'] = tag.find('a', href=re.compile('category')).string

    # Optional field: an explicit None check replaces the former bare
    # ``except:``, which also swallowed KeyboardInterrupt and real bugs.
    author = tag.find('meta', property='author')
    video_data['Speakers'] = 'Unknown' if author is None else author.get('content')

    # The value is taken from the node parsed immediately before the <meta>
    # tag (presumably the visible text of the field -- verify against the
    # page markup).
    video_data['Language'] = tag.find('meta', property='inLanguage').previous_element.strip()
    video_data['Recorded'] = tag.find('meta', property='dateCreated').previous_element.strip()

    # Optional field, same fallback as 'Speakers'.
    embed = tag.find('a', property='embedUrl')
    video_data['Video origin'] = 'Unknown' if embed is None else embed.get('href')
    return video_data
def show_video_stats():
    """Sequentially fetch every video listed on the index and print its metadata dict."""
    # map() is lazy, so each page is fetched right before it is printed.
    for info in map(get_video_msg, get_video_page_urls()):
        print(info)
class Mythread(threading.Thread):
    """Worker thread that drains the shared queue ``q`` of video-page hrefs.

    Each worker repeatedly takes an href from ``q``, scrapes it with
    ``get_video_msg`` and appends the resulting dict to the shared list
    ``result``. It exits once the queue is empty.
    """

    def __init__(self, name):
        """Create a worker.

        Args:
            name: Label printed with every processed item (callers pass ints).
        """
        threading.Thread.__init__(self)
        # Assigned through the ``name`` property after base initialisation,
        # so every label the caller passes (including 0) is kept.
        self.name = name

    def run(self):
        """Process queued hrefs until none are left."""
        while True:
            # A non-blocking get replaces the ``empty()``-then-``get()`` pair:
            # another worker could take the last item between those two calls
            # and leave this thread blocked in ``get()`` forever.
            try:
                url = q.get_nowait()
            except queue.Empty:
                break
            print('thread:', self.name)
            try:
                result.append(get_video_msg(url))
            except Exception as exc:
                # One bad page must not kill the worker; report it and move on.
                print('thread:', self.name, 'failed on', url, '-', repr(exc))
            finally:
                # Always acknowledge the item, otherwise ``q.join()`` never returns.
                q.task_done()
# get_video_msg('/video/2668/writing-restful-web-services-with-flask')
# show_video_stats()
def main():
    """Scrape all indexed video pages with eight worker threads and print the results."""
    # Fill the shared queue before any worker starts.
    for page_url in get_video_page_urls():
        q.put(page_url)

    for worker_id in range(8):
        worker = Mythread(worker_id)
        worker.start()

    # Block until every queued href has been acknowledged with task_done().
    q.join()

    for entry in result:
        print(entry)


main()
# Practice example for BeautifulSoup, written while following a tutorial.
# (Blog footer: latest recommended article published 2025-02-07 17:23:46.)