这是以前写的一个Python小程序,原本放在笔记中,现在搬到这里来。
因为Android开发需要一些数据,自己写了一个小小的Python程序来抓取数据。过程可谓一波三折,主要是Python的字符串编码问题,在这记录一下。
直接上代码
# encoding: utf-8
import urllib2
import json
from bs4 import BeautifulSoup
# URL prefix of the site being scraped; start_parser prepends it to every
# href it extracts to build the full link.
domain = 'http://www.joy.cn/news/'
def start_parser(domain_url):
    """Download a page and map each matching link's text to its full URL.

    Args:
        domain_url: URL of the page to download and parse.

    Returns:
        A dict mapping the text of every ``div.joy_news_div a.joy_item_a``
        anchor to the module-level ``domain`` concatenated with that
        anchor's ``href``. Anchors without an ``href`` attribute are
        skipped. If several anchors share the same text, the last one wins.

    Raises:
        urllib2.URLError: if the page cannot be fetched.
    """
    response = urllib2.urlopen(domain_url)
    try:
        html = response.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so close
        # explicitly to release the socket even when read() fails.
        response.close()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    # NOTE: links are prefixed with the global ``domain``, not ``domain_url``.
    return {
        anchor.get_text(): domain + anchor.attrs['href']
        for anchor in soup.select('div.joy_news_div a.joy_item_a')
        if 'href' in anchor.attrs  # str + None would raise TypeError
    }
def get_video_url(video_page_url):
    """Return the ``src`` of the first ``<source>`` inside ``div.video``.

    Args:
        video_page_url: URL of the page to download and parse.

    Returns:
        The value of the ``src`` attribute, or None when the element has
        no ``src`` attribute.

    Raises:
        IndexError: if the page contains no ``div.video source`` element.
        urllib2.URLError: if the page cannot be fetched.
    """
    response = urllib2.urlopen(video_page_url)
    try:
        html = response.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so close
        # explicitly to release the socket even when read() fails.
        response.close()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    sources = soup.select('div.video source')
    # Only the first match is used; an empty result raises IndexError.
    return sources[0].attrs.get('src')
def generate_json_file(domain_url):
url_list = []
page_urls = s