# 这几天学了一点爬虫,写了个爬取豆瓣电影 Top250 的代码,分别用 requests 库和 urllib 库实现,
# 想看看自己能不能做出点东西。虽然很简单,但还是小开心。
import requests
import re
# https://movie.douban.com/top250?start=25&filter=
# <span class="title">勇士</span>
# Next rank number to print. showdata() declares this name `global` and
# increments it after each title, so numbering continues across pages.
count = 1
def getdata(url, timeout=10):
    """Download *url* with ``requests`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP 4xx/5xx status.
    """
    # NOTE(review): some sites reject the library's default User-Agent;
    # a browser-like value is sent as a precaution -- confirm it is needed.
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # parser, which would silently yield no titles.
    response.raise_for_status()
    return response.text
def showdata(data):
    """Print each title found in an HTML page, numbered consecutively.

    Every ``<span class="title">...</span>`` element is extracted. Entries
    whose text contains ``"nbsp"`` are skipped (presumably alternate-title
    spans that carry ``&nbsp;`` separators -- verify against the page).
    Each remaining title is printed as ``"<count> <title>"``.

    Args:
        data: HTML source of one page, as a string.

    Side effects:
        Writes one line per title to stdout and advances the module-level
        ``count`` by the number of titles printed.
    """
    global count
    titles = [
        title
        for title in re.findall(r'<span class="title">(.*?)</span>', data)
        if "nbsp" not in title
    ]
    for title in titles:
        print(count, title)
        count += 1
# Fetch ten pages in turn. The `start` query parameter takes the offsets
# 0, 25, ..., 225 (a step of 25, presumably the number of entries per page).
for start in range(0, 250, 25):
    url = "https://movie.douban.com/top250?start={}&filter=".format(start)
    data = getdata(url)
    showdata(data)
# 用requests来实现,正则表达式解析网页
import urllib
import urllib.request
import re
#https://movie.douban.com/top250?start=25&filter=
#<span class="title">勇士</span>
# Rank counter for the urllib-based version; starts again from 1.
# showdata() declares this name `global` and increments it per title.
count = 1
def getdata(url, timeout=10):
    """Download *url* with ``urllib`` and return the body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on blocking network operations before
            giving up.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        urllib.error.URLError: On connection failure or an HTTP error status
            (``HTTPError`` is a subclass).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # NOTE(review): some sites reject urllib's default User-Agent; a
    # browser-like value is sent as a precaution -- confirm it is needed.
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode("utf-8")
def showdata(data):
    """Print each title found in an HTML page, numbered consecutively.

    Every ``<span class="title">...</span>`` element is extracted. Entries
    whose text contains ``"nbsp"`` are skipped (presumably alternate-title
    spans that carry ``&nbsp;`` separators -- verify against the page).
    Each remaining title is printed as ``"<count> <title>"``.

    Args:
        data: HTML source of one page, as a string.

    Side effects:
        Writes one line per title to stdout and advances the module-level
        ``count`` by the number of titles printed.
    """
    global count
    titles = [
        title
        for title in re.findall(r'<span class="title">(.*?)</span>', data)
        if "nbsp" not in title
    ]
    for title in titles:
        print(count, title)
        count += 1
# Fetch ten pages in turn. The `start` query parameter takes the offsets
# 0, 25, ..., 225 (a step of 25, presumably the number of entries per page).
for start in range(0, 250, 25):
    url = "https://movie.douban.com/top250?start={}&filter=".format(start)
    data = getdata(url)
    showdata(data)
#用urllib来实现,正则表达式解析网页
# emmmmmmm