要爬取的网页：豆瓣电影 Top 250（https://movie.douban.com/top250）
Python程序
#!/usr/bin/python3.4
#-*- coding:utf-8-*-
#FileName:getdangdang.py
#Author:duhongjiang
#Date:2018/2/24 20:08
import csv
import re

import requests
from bs4 import BeautifulSoup
# Scrape the Douban "Top 250" movie list and store title, rating and vote
# count of every entry as a comma-separated text file.

# Listing URL; ``start`` is the zero-based index of the first movie on the page.
BASE_URL = 'https://movie.douban.com/top250?start={start}&filter='
PAGE_SIZE = 25          # step used for the ``start`` query parameter
PAGE_COUNT = 10         # number of listing pages requested
REQUEST_TIMEOUT = 10    # seconds; without a timeout requests.get may block forever
OUTPUT_PATH = 'thefile.txt'
CSV_HEADER = ['电影名称', '评分', '评论人数']

# Compiled once at import time instead of once per downloaded page.
# The title pattern rejects any span whose text contains '/'.
TITLE_RE = re.compile(r'<span class="title">([^\/]+)</span>')
RATING_RE = re.compile(r'<span class="rating_num" property="v:average">(\d+\.\d+)</span>')
VOTES_RE = re.compile(r'<span>(.*)人评价</span>')


def build_url(page):
    """Return the listing URL for the zero-based page index *page*."""
    return BASE_URL.format(start=page * PAGE_SIZE)


def parse_page(page_text):
    """Extract ``[title, rating, votes]`` rows from one page of markup.

    Args:
        page_text: HTML of a listing page as a string.

    Returns:
        A list of three-item lists of strings, in document order.

    Raises:
        ValueError: if the numbers of titles, ratings and vote counts found
            differ, because pairing them positionally would then attach
            values to the wrong movie.
    """
    titles = TITLE_RE.findall(page_text)
    ratings = RATING_RE.findall(page_text)
    votes = VOTES_RE.findall(page_text)
    if not len(titles) == len(ratings) == len(votes):
        raise ValueError(
            'field count mismatch: {0} titles, {1} ratings, {2} vote counts'
            .format(len(titles), len(ratings), len(votes))
        )
    return [list(row) for row in zip(titles, ratings, votes)]


def main():
    """Download every listing page and write all rows to OUTPUT_PATH.

    Network, parsing and file errors are printed rather than raised, and the
    closing banner is always printed.  The output file is only opened after
    all pages were fetched successfully, so a failed run does not truncate a
    previous result.
    """
    print('from douban dianying Top 250')
    print('------------------------begain------------------')
    rows = [list(CSV_HEADER)]
    try:
        for page in range(PAGE_COUNT):
            url = build_url(page)
            print('------------------------', page, '------------------')
            print(url)
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()  # turn HTTP error statuses into exceptions
            # The patterns are applied to BeautifulSoup's re-serialised markup,
            # not to the raw response body.
            page_text = str(BeautifulSoup(response.text, 'html.parser'))
            page_rows = parse_page(page_text)
            for row in page_rows:
                print(row[0])
            rows.extend(page_rows)
        print(rows)
        # Explicit UTF-8 so the Chinese text does not depend on the locale;
        # newline='' hands line-ending control to the csv writer.
        with open(OUTPUT_PATH, 'w', encoding='utf-8', newline='') as out_file:
            csv.writer(out_file, lineterminator='\n').writerows(rows)
    except (requests.RequestException, ValueError, OSError) as err:
        print(err)
    finally:
        print('-----------------------end----------------')


if __name__ == '__main__':
    main()
程序执行结果
duhj@ubuntu:~/Desktop/work$ python3.4 getdangdang.py
程序处理得到的数据文件
逗号分隔的CSV格式,可以放入Hadoop HDFS文件系统中
电影名称,评分,评论人数
肖申克的救赎,9.6,980298
霸王别姬,9.5,712104
这个杀手不太冷,9.4,925493
阿甘正传,9.4,786983
美丽人生,9.5,459967
千与千寻,9.2,736229
泰坦尼克号,9.2,726576
辛德勒的名单,9.4,419526