1 爬取网站
猫眼: http://maoyan.com/board/4
2 代码
import requests
from bs4 import BeautifulSoup
import time
def get_one_page(url):
    """Fetch *url* with a browser-like User-Agent and return the body text.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on a
            network failure or when the timeout below is exceeded.
    """
    headers = {
        # Adjacent string literals keep the product tokens separated by
        # exactly one space. A backslash continuation inside the literal
        # either drops that space or embeds the next line's indentation.
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/65.0.3325.162 Safari/537.36'
        ),
    }
    # Without a timeout an unresponsive server would block the caller forever.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(soup):
    """Extract one record per ``<dd>`` element of a parsed page.

    Args:
        soup: A parsed document (e.g. a ``BeautifulSoup`` object) exposing
            ``find_all``; every ``<dd>`` it yields must expose ``find``.

    Returns:
        A list holding one six-item list per ``<dd>``, in document order:

        0. text of the first ``<i>``
        1. text of the first ``<p>``
        2. ``data-src`` attribute of ``<img class="board-img">``
        3. text of ``<p class="star">``
        4. text of ``<p class="releasetime">``
        5. text of ``<p class="score">``

        Items 0 and 1 are presumably the rank and the title on the board
        page -- verify against the site's markup.  A field whose element
        (or attribute) is absent is ``None`` instead of aborting the whole
        page with an ``AttributeError``/``KeyError``.
    """
    def text_of(tag):
        # find() yields None when nothing matches; keep the slot as None.
        return tag.text if tag is not None else None

    result = []
    for dd in soup.find_all(name='dd'):
        img = dd.find(name='img', class_='board-img')
        result.append([
            text_of(dd.find(name='i')),
            text_of(dd.find(name='p')),
            img.attrs.get('data-src') if img is not None else None,
            text_of(dd.find(name='p', class_='star')),
            text_of(dd.find(name='p', class_='releasetime')),
            text_of(dd.find(name='p', class_='score')),
        ])
    return result
def main(offset):
    """Fetch one board page and return its parsed records.

    Args:
        offset: Value placed in the ``offset`` query parameter of the board
            URL (presumably the index of the first entry on the page --
            confirm against the site).

    Returns:
        The list produced by ``parse_one_page``, or an empty list when
        ``get_one_page`` returns ``None`` (non-200 response).
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    # One-second pause before every request; presumably a politeness delay
    # so that repeated calls do not hammer the site.
    time.sleep(1)
    html = get_one_page(url)
    if html is None:
        # There is no markup to parse; report "no records" rather than
        # handing None to BeautifulSoup.
        return []
    soup = BeautifulSoup(html, 'lxml')
    return parse_one_page(soup)
if __name__ == '__main__':
    # Run the demo only when executed as a script, so that importing this
    # module does not trigger a network request.
    print(main(10))