A Detailed Guide to Implementing a Vertical Crawler System in Python

This article walks through a method for implementing a vertical (focused) crawler system in Python. The example code is explained in detail and should be a useful reference for anyone interested; I hope it helps you.

html_downloader


from urllib import request

def download(url):
    """Download the raw HTML of a page; return None on failure."""
    if url is None:
        return None
    response = request.urlopen(url)
    # Anything other than HTTP 200 is treated as a failed download.
    if response.getcode() != 200:
        return None
    # The body is returned as bytes; callers decode it as needed.
    return response.read()
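A quick way to sanity-check the downloader is to fetch a page and decode the returned bytes; the URL below is only a placeholder, so swap in the entry page of the site you actually want to crawl:

html_cont = download('http://example.com')        # placeholder URL
if html_cont is not None:
    # download() returns bytes, so decode before working with the text
    print(html_cont.decode('utf-8')[:200])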

html_outeputer


data_list = []

def collect_data(data):
    """Cache one parsed record (a dict) until output_html() is called."""
    data_list.append(data)

def output_html():
    """Dump all collected records into a simple HTML table."""
    # utf-8 avoids encoding errors when titles contain non-ASCII text.
    fout = open('output.html', 'w', encoding='utf-8')
    fout.write('<html>')
    fout.write('<body>')
    fout.write('<table>')
    for dataitem in data_list:
        fout.write('<tr>')
        fout.write('<td>%s</td>' % dataitem['url'])
        fout.write('<td>%s</td>' % dataitem['title'])
        fout.write('<td>%s</td>' % dataitem['datetime'])
        fout.write('<td>%s</td>' % dataitem['visitcount'])
        fout.write('</tr>')
    fout.write('</table>')
    fout.write('</body>')
    fout.write('</html>')
    fout.close()
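As a minimal, self-contained check of the outputer, one can collect a hand-built record and write the table; the field values below are made up purely for illustration:

collect_data({
    'url': 'http://example.com/page.htm',   # placeholder values for the demo
    'title': 'Sample title',
    'datetime': '2022-01-01',
    'visitcount': '100',
})
output_html()   # writes output.html into the current working directory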

html_parser


import re

from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_new_urls(page_url, soup):
    """Collect links whose href matches the target site's article URL pattern."""
    new_urls = set()
    links = soup.find_all('a', href=re.compile(r"/\d+/\d+/\w+/page\.htm"))
    for link in links:
        new_url = link['href']
        # Relative links are resolved against the current page URL.
        new_full_url = urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls

def get_new_data(page_url, soup):
    """Extract title, update time and visit count from an article page."""
    res_data = {}
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        # Not an article page: return an empty record.
        return res_data
    res_data['title'] = title_node.get_text()
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text()
    visitcount_node = soup.find('span', class_='WP_VisitCount')
    res_data['visitcount'] = visitcount_node.get_text()
    res_data['url'] = page_url
    return res_data

def parse(page_url, html_cont):
    """Parse downloaded HTML into (new_urls, new_data)."""
    if page_url is None or html_cont is None:
        return
    # html_cont is the bytes returned by download(); from_encoding tells
    # BeautifulSoup how to decode it.
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data
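The parser sits between the downloader and the outputer. A rough usage sketch, assuming the download() function from html_downloader above and a placeholder article URL:

page_url = 'http://news.example.edu.cn/2022/0101/c123a456/page.htm'   # placeholder URL
html_cont = download(page_url)          # from html_downloader
if html_cont is not None:
    new_urls, new_data = parse(page_url, html_cont)
    print(len(new_urls), new_data.get('title'))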

spider_main
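A minimal sketch of how the modules in this article could be driven from a main crawl loop, assuming they are saved as urls_manager.py, html_downloader.py, html_parser.py and html_outeputer.py; the root URL and the page limit are placeholders, not values from the original project:

import urls_manager, html_downloader, html_parser, html_outeputer

def craw(root_url, max_count=100):
    # Seed the queue, then keep crawling until it is empty or the limit is hit.
    urls_manager.add_new_url(root_url)
    count = 1
    while urls_manager.has_new_url():
        try:
            new_url = urls_manager.get_new_url()
            print('craw %d : %s' % (count, new_url))
            html_cont = html_downloader.download(new_url)
            new_urls, new_data = html_parser.parse(new_url, html_cont)
            urls_manager.add_new_urls(new_urls)
            if new_data:
                html_outeputer.collect_data(new_data)
            if count >= max_count:
                break
            count = count + 1
        except Exception:
            print('craw failed')
    html_outeputer.output_html()

if __name__ == '__main__':
    craw('http://news.example.edu.cn/')   # placeholder root URL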

test_64


from bs4 import BeautifulSoup
import re

# Sample document taken from the BeautifulSoup documentation.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

print('Get all links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())

print('Get the lacie link')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

print('Regex match')
link_node = soup.find('a', href=re.compile(r'ill'))
print(link_node.name, link_node['href'], link_node.get_text())

print('Get the p paragraph text')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())
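Run as a script, the example above should print output along these lines (traced by hand from the sample document, so treat it as illustrative):

Get all links
a http://example.com/elsie Elsie
a http://example.com/lacie Lacie
a http://example.com/tillie Tillie
Get the lacie link
a http://example.com/lacie Lacie
Regex match
a http://example.com/tillie Tillie
Get the p paragraph text
p The Dormouse's story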

urls_manager


# Two sets track crawl state: URLs waiting to be crawled and URLs already crawled.
new_urls = set()
old_urls = set()

def add_new_url(url):
    """Queue a single URL if it has not been seen before."""
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)

def add_new_urls(urls):
    """Queue a batch of URLs."""
    if urls is None or len(urls) == 0:
        return
    for url in urls:
        add_new_url(url)

def get_new_url():
    """Take an arbitrary pending URL and mark it as crawled."""
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url

def has_new_url():
    """Return True while there are still URLs waiting to be crawled."""
    return len(new_urls) != 0
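On its own, the manager behaves like a de-duplicating work queue. A small self-contained check, with placeholder URLs:

add_new_urls({'http://example.com/a', 'http://example.com/b'})
add_new_url('http://example.com/a')    # already queued, so it is ignored
while has_new_url():
    print(get_new_url())               # each URL comes out exactly once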

Summary

That's all for this article. I hope it has been helpful.

