# Day 18 homework: scrape a website of your choice and parse it with XPath
import requests
from lxml import etree
from re import findall
import csv
# 尝试
# response = requests.get('...')
# print(response)
def _first_or_empty(nodes):
    """Return the first XPath result in *nodes*, or '' when there is none."""
    return nodes[0] if nodes else ''


def get_one_page(page):
    """Fetch one listing page and extract the film entries found on it.

    Every ``div`` with class ``post-box-container`` is treated as one film.
    A field whose node is missing is recorded as an empty string rather
    than aborting the whole page with an ``IndexError``.

    Args:
        page: Page number; it is shown in the progress message.

    Returns:
        A list of ``[name, tag, describe, img_url]`` rows, one per film div,
        in document order. The list is empty when no film div is found.
    """
    # 1. Download the page source.
    # NOTE(review): the URL template has no ``{page}`` placeholder as written,
    # so every call requests the same address - confirm the real URL.
    url = f'...'
    # A timeout keeps a stalled server from hanging the scrape indefinitely.
    response = requests.get(url, timeout=10)
    html = response.text

    # 2. Parse the data.
    root = etree.HTML(html)
    # One container div per film.
    all_film_div = root.xpath('//div[@class="post-box-container"]')

    rows = []
    for div in all_film_div:
        name = _first_or_empty(
            div.xpath('./div[@class="post-box-text"]/h2/a/text()'))
        tag = _first_or_empty(
            div.xpath('./div[@class="post-box-text"]/span/a/text()'))
        describe = _first_or_empty(
            div.xpath('./div[@class="post-box-text"]/p/text()'))
        # NOTE(review): assumes the poster is the first <img> inside the
        # container - verify against the page markup.
        img_url = _first_or_empty(div.xpath('.//img/@src'))
        rows.append([name, tag, describe, img_url])

    print(f'第{page}页获取成功!')
    return rows
if __name__ == '__main__':
    # The context manager guarantees the file is flushed and closed even if a
    # page fails midway. newline='' lets the csv module control line endings
    # itself, which prevents blank rows between records on Windows.
    with open('files/films.csv', 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['电影名称', '标签', '描述', '图片网址'])
        # Scrape pages 1 through 5.
        for x in range(1, 6):
            rows = get_one_page(x)
            # Persist any rows the page scrape hands back; a None or empty
            # result is simply skipped.
            if rows:
                writer.writerows(rows)