# 用循环爬出豆瓣 top250 的所有网页
# 于是就有了豆瓣电影 top250 的所有网页
# 由于这 10 个页面都是一样的结构,所以我们只要能解析其中一个页面就能循环得到所有信息
# 所以现在的程序就只剩下了解析 HTML
# 请观察规律,解析出
# 1,电影名
# 2,分数
# 3,评价人数
# 4,引用语(比如第一部肖申克的救赎中的「希望让人自由。」)
# encoding: utf-8
import socket
import ssl
def get(url):
    """Fetch *url* with a raw HTTP/1.1 GET and return the decoded body+headers.

    Supports both http and https URLs. Sends "Connection: close" so the
    server terminates the stream when the response is complete, letting us
    read until EOF instead of guessing the message boundary from chunk
    sizes (the old ``len(r) < buffer_size`` heuristic could truncate a
    response that arrived in sub-buffer-sized chunks).

    :param url: full URL, e.g. 'https://movie.douban.com/top250'
    :return: the whole raw response (status line + headers + body) as str
    """
    protocol, _, rest = url.partition('://')  # split scheme from host+path
    slash = rest.find('/')
    if slash == -1:
        # URL like 'https://example.com' has no path component
        host, path = rest, '/'
    else:
        host, path = rest[:slash], rest[slash:]
    if protocol == 'https':
        port = 443
        # ssl.wrap_socket() was deprecated and removed in Python 3.12;
        # an SSLContext also gives us certificate + hostname checking.
        context = ssl.create_default_context()
        s = context.wrap_socket(socket.socket(), server_hostname=host)
    else:
        port = 80
        s = socket.socket()
    encoding = 'utf-8'
    try:
        s.connect((host, port))
        request = 'GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(path, host)
        s.send(request.encode(encoding))
        response = b''
        while True:
            chunk = s.recv(1024)
            if not chunk:  # empty recv() means the server closed: EOF
                break
            response += chunk
    finally:
        s.close()  # original leaked the socket
    return response.decode(encoding)
def get_name(r):
    """Extract the movie titles from a Top250 page.

    The page has one ``<div class="info">`` per movie; the first
    ``<span class="title">`` inside each holds the (Chinese) title.

    :param r: raw page HTML as a string
    :return: list of title strings, in page order
    """
    def first_title(chunk):
        text = chunk.split('<span class="title">')[1]
        return text[:text.find('</span>')]

    # [1:] drops everything before the first movie entry
    return [first_title(chunk) for chunk in r.split('<div class="info">')[1:]]
def get_score(r):
    """Extract the average-rating strings (e.g. '9.7') from a Top250 page.

    :param r: raw page HTML as a string
    :return: list of score strings, in page order
    """
    marker = '<span class="rating_num" property="v:average">'
    # each split piece after the first starts with the score text
    return [piece[:piece.find('</span>')] for piece in r.split(marker)[1:]]
def get_comment(r):
    """Extract the rating-count text (e.g. '1234人评价') for each movie.

    Anchors on the rating widget span, which appears exactly once per
    movie, then takes the first plain ``<span>`` that follows it.

    :param r: raw page HTML as a string
    :return: list of rating-count strings, in page order
    """
    anchor = '<span property="v:best" content="10.0"></span>'
    counts = []
    for section in r.split(anchor)[1:]:
        tail = section.split('<span>')[1]
        end = tail.find('</span>')
        counts.append(tail[:end])
    return counts
def get_quote(r):
    """Extract each movie's one-line quote; '' when a movie has none.

    Splits on the rating widget (one per movie) rather than on the quote
    span itself, so movies without a quote still produce a placeholder
    entry and the list stays aligned with the other parsers.

    :param r: raw page HTML as a string
    :return: list of quote strings (possibly ''), in page order
    """
    anchor = '<span property="v:best" content="10.0"></span>'
    tag = '<span class="inq">'
    quotes = []
    for section in r.split(anchor)[1:]:
        start = section.find(tag)
        if start < 0:
            quotes.append('')  # this movie has no quote
        else:
            text = section[start + len(tag):]
            quotes.append(text[:text.find('</span>')])
    return quotes
def main():
    """Scrape all 10 pages of the Douban movie Top 250 and print each
    movie's name, score, rating count, and quote.

    Fixes over the original: the first page is fetched by the same loop
    instead of duplicated code (start=0 returns the same page as the
    bare URL), and the output loop uses zip() instead of indexing to a
    hard-coded 250, so a page with fewer entries cannot IndexError.
    """
    names, scores, comments, quotes = [], [], [], []
    # the list is paginated 25 movies per page: start = 0, 25, ..., 225
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
        page = get(url)
        names += get_name(page)
        scores += get_score(page)
        comments += get_comment(page)
        quotes += get_quote(page)
    # zip stops at the shortest list, keeping the four columns aligned
    for name, score, comment, quote in zip(names, scores, comments, quotes):
        print(name, score, comment, quote)
if __name__ == '__main__':
    main()