# encoding: utf-8
"""Simple web scraper for Mtime (时光网) box-office movie rankings.

Builds the JSON-backed ranking-page URL for a given year/page/area,
parses the returned listing with BeautifulSoup, and appends the
extracted fields to an open text file.

Typical driver values (from the original script): areas
``'china'``, ``'NorthAmerica'``, ``'global'``; ``pagecount = 1``;
years ``2016``..``2017``.

NOTE(review): ported from Python 2 to Python 3 — the original used
``print`` statements, ``urllib.urlopen``, the ``urlparse`` module and
the ``reload(sys)``/``setdefaultencoding`` hack, none of which exist
on a supported interpreter.
"""
import urllib.parse
import urllib.request


def geturlparams(url):
    """Return the query-string parameters of *url* as a flat dict.

    ``parse_qs`` yields a list of values per key; only the first value
    is kept, matching the original behaviour.
    """
    query = urllib.parse.urlparse(url).query
    return {k: v[0] for k, v in urllib.parse.parse_qs(query).items()}


def geturl(year, page, area):
    """Build the Mtime box-office ranking URL for *year*/*page*/*area*.

    BUG FIX: the original literal contained ``×tamp=`` — the HTML
    entity ``&times`` had been rendered as the ``×`` character when the
    code was copied off a web page — so the ``timestamp`` query
    parameter was mangled. Restored to ``&timestamp=``.
    """
    return (
        'http://movie.mtime.com/boxoffice/?year=%s' % year
        + '&area=%s' % area
        + '&type=MovieRankingYear&category=all&page=%s' % page
        + '&display=list&timestamp=1515392838844'
          '&version=07bb781100018dd58eafc3b35d42686804c6df8d&dataType=json'
    )


def export(file, url):
    """Fetch *url*, parse each ``<dd>`` entry and write its fields to *file*.

    Each field is also printed to stdout as progress feedback, in the
    same order as the original script (all three headline fields
    printed first, then recorded).
    """
    # Lazy import: bs4 is third-party, and the pure URL helpers above
    # should remain usable without it installed.
    from bs4 import BeautifulSoup

    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    for entry in soup.find_all('dd'):
        # Headline fields of one ranking entry: rank (<i>), title (<h3>),
        # secondary title (<h4>).
        print(entry.i.string)
        print(entry.h3.string)
        print(entry.h4.string)
        record(file, entry.i.string)
        record(file, entry.h3.string)
        record(file, entry.h4.string)
        # Remaining detail lines (<p>) of the entry.
        for p in entry.find_all('p'):
            text = p.get_text()
            print(text)
            record(file, text)
        print("************************************************")
        record(file, "************************************************")


def record(file, param):
    """Append *param* to the open text file *file* with a CRLF terminator."""
    file.write(param + '\r\n')


def scraping(file, pagecount, minyear, maxyear, area):
    """Scrape pages ``[0, pagecount)`` for years ``[minyear, maxyear)`` in *area*."""
    for page in range(pagecount):
        for year in range(minyear, maxyear):
            export(file, geturl(year, page, area))


# Demo: parse the query string of a sample ranking URL (no network access;
# the &timestamp parameter here carries the same entity fix as geturl()).
print(geturlparams(
    'http://movie.mtime.com/boxoffice/?year=2016&area=china'
    '&type=MovieRankingYear&category=all&page=0&display=list'
    '&timestamp=1515573624641'
    '&version=07bb781100018dd58eafc3b35d42686804c6df8d&dataType=json'))
简易的网络爬虫代码-python
最新推荐文章于 2021-02-05 17:23:08 发布