python爬虫笔试
列表页:
1> 文章标题
2> 摘要
3> 时间
4> 列表中文章的图片
详情页:
1> 文章题目
2> 原文标题
3> 原文来源
4> 文章正文
5> 原文链接
1.尝试爬取地址为: https://www.theblockbeats.info/topic/87 下面的10篇文
章并使用 Beautiful Soup 解析到以下内容
2
[{'title_1': {'summary': '', 'time': '', 'icon': ''}}, ...]
得到结果格式:
列表页:
详情页:
3
[{'title': '', 'original_title': '', 'original_source': '', 'content': '', 'original_link': '' }, ...]
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup # 网页解析
import urllib.request, urllib.error # URL操作,获取网页数据
# 爬虫函数
def crawl(url, headers):
    """Fetch the topic list page and parse every article card on it.

    Args:
        url: List-page URL to fetch.
        headers: HTTP request headers (e.g. a User-Agent) passed to urllib.

    Returns:
        A list of dicts, one per article, with keys 'title', 'summary',
        'time' and 'icon' (image URL).

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    # Parse the page with BeautifulSoup's built-in html.parser.
    soup = BeautifulSoup(html, "html.parser")

    articles = []
    containers = soup.find_all('div', class_="news-flash-components")
    if not containers:
        # Layout changed or empty response: return an empty list instead of
        # raising IndexError like the bare [0] index did.
        return articles
    # The first container's direct children are the individual article cards.
    for item in containers[0]:
        # Skip bare text nodes (whitespace between tags has no .find/.a).
        if not hasattr(item, 'find'):
            continue
        articles.append({
            # Article title is carried in the title attribute of the card link.
            'title': item.a["title"],
            # Summary text of the card.
            'summary': item.find('div', class_='home-news-lft-content text-ellipsis2').text,
            # Publication time label.
            'time': item.find('div', class_='home-news-time').text,
            # Thumbnail image URL.
            'icon': item.div.img["src"],
        })
    return articles
if __name__ == '__main__':
    # Entry point: scrape the topic list page and print the parsed entries.
    target_url = "https://www.theblockbeats.info/topic/87"
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/67.0.3396.99 Safari/537.36'
        ),
    }
    print(crawl(target_url, request_headers))
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request, urllib.error
# 爬虫函数
def crawl(url, headers):
    """Fetch the topic list page and collect each article's detail-page href.

    Args:
        url: List-page URL to fetch.
        headers: HTTP request headers passed to urllib.

    Returns:
        A list of relative href strings, one per article card found.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    containers = soup.find_all('div', class_="news-flash-components")
    if not containers:
        # No list container found: return an empty list instead of raising
        # IndexError like the bare [0] index did.
        return []
    # The first container's direct children are the article cards; take the
    # link target from each card's first <a>.  The original appended an empty
    # dict and immediately overwrote it with the href string; the guarded
    # comprehension builds the string list directly and skips bare text nodes.
    return [item.a["href"] for item in containers[0] if hasattr(item, 'find')]
def crawlx(url, headers):
    """Fetch one article detail page and parse its fields.

    Args:
        url: Detail-page URL to fetch.
        headers: HTTP request headers passed to urllib.

    Returns:
        A dict with keys 'title', 'original_title', 'original_source',
        'content' (stringified list of <p> tags, as before) and
        'original_link'.  The two 'original_*' keys are present only when
        the page has the corresponding <blockquote> elements.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    result = {}
    title_div = soup.find('div', class_="news-title")
    # Guard against layout changes: a missing title yields '' instead of the
    # AttributeError the original raised on .text of None.
    result['title'] = title_div.text if title_div else ''

    # By page convention the first <blockquote> holds the original title and
    # the second the original source; index directly instead of counting
    # through a loop with a manual flag variable.
    quotes = soup.find_all('blockquote')
    if len(quotes) >= 1:
        result['original_title'] = quotes[0].text
    if len(quotes) >= 2:
        result['original_source'] = quotes[1].text
    # Article body: stringified list of all <p> tags (kept as-is).
    result['content'] = str(soup.find_all('p'))
    # BUG FIX: the original stored the bs4 Tag object itself; store its text
    # so the result holds a plain string like every other field.
    result['original_link'] = quotes[-1].text if quotes else ''
    return result
if __name__ == '__main__':
    # Entry point: collect article links from the list page, then scrape up
    # to 10 detail pages and print the parsed results.
    url = "https://www.theblockbeats.info/topic/87"
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/67.0.3396.99 Safari/537.36'
        ),
    }
    # Make the relative hrefs returned by crawl() absolute.
    links = ["https://www.theblockbeats.info" + href for href in crawl(url, headers)]
    # Slice to at most 10 pages: the original indexed range(10) unconditionally,
    # which raised IndexError whenever fewer than 10 links were found.
    jieguo = [crawlx(link, headers) for link in links[:10]]
    print(jieguo)