A Detailed Guide to Implementing a Vertical Crawler System in Python

This article walks through a method for implementing a vertical (focused) crawler system in Python. The example code is explained in detail and should be a useful reference for anyone interested; I hope it helps you.

html_downloader


from urllib import request

def download(url):
    """Download the raw HTML of a page; return None on failure."""
    if url is None:
        return None
    response = request.urlopen(url)
    # Anything other than HTTP 200 is treated as a failed download.
    if response.getcode() != 200:
        return None
    # The body is returned as bytes; callers decode it as needed.
    return response.read()
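A quick way to sanity-check the downloader is to fetch a page and decode the returned bytes; the URL below is only a placeholder, so swap in the entry page of the site you actually want to crawl:

html_cont = download('http://example.com')        # placeholder URL
if html_cont is not None:
    # download() returns bytes, so decode before working with the text
    print(html_cont.decode('utf-8')[:200])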

html_outeputer


data_list = []

def collect_data(data):
    """Cache one parsed record (a dict) until output_html() is called."""
    data_list.append(data)

def output_html():
    """Dump all collected records into a simple HTML table."""
    # utf-8 avoids encoding errors when titles contain non-ASCII text.
    fout = open('output.html', 'w', encoding='utf-8')
    fout.write('<html>')
    fout.write('<body>')
    fout.write('<table>')
    for dataitem in data_list:
        fout.write('<tr>')
        fout.write('<td>%s</td>' % dataitem['url'])
        fout.write('<td>%s</td>' % dataitem['title'])
        fout.write('<td>%s</td>' % dataitem['datetime'])
        fout.write('<td>%s</td>' % dataitem['visitcount'])
        fout.write('</tr>')
    fout.write('</table>')
    fout.write('</body>')
    fout.write('</html>')
    fout.close()
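As a minimal, self-contained check of the outputer, one can collect a hand-built record and write the table; the field values below are made up purely for illustration:

collect_data({
    'url': 'http://example.com/page.htm',   # placeholder values for the demo
    'title': 'Sample title',
    'datetime': '2022-01-01',
    'visitcount': '100',
})
output_html()   # writes output.html into the current working directory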

html_parser


import re

from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_new_urls(page_url, soup):
    """Collect links whose href matches the target site's article URL pattern."""
    new_urls = set()
    links = soup.find_all('a', href=re.compile(r"/\d+/\d+/\w+/page\.htm"))
    for link in links:
        new_url = link['href']
        # Relative links are resolved against the current page URL.
        new_full_url = urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls

def get_new_data(page_url, soup):
    """Extract title, update time and visit count from an article page."""
    res_data = {}
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        # Not an article page: return an empty record.
        return res_data
    res_data['title'] = title_node.get_text()
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text()
    visitcount_node = soup.find('span', class_='WP_VisitCount')
    res_data['visitcount'] = visitcount_node.get_text()
    res_data['url'] = page_url
    return res_data

def parse(page_url, html_cont):
    """Parse downloaded HTML into (new_urls, new_data)."""
    if page_url is None or html_cont is None:
        return
    # html_cont is the bytes returned by download(); from_encoding tells
    # BeautifulSoup how to decode it.
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data
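The parser sits between the downloader and the outputer. A rough usage sketch, assuming the download() function from html_downloader above and a placeholder article URL:

page_url = 'http://news.example.edu.cn/2022/0101/c123a456/page.htm'   # placeholder URL
html_cont = download(page_url)          # from html_downloader
if html_cont is not None:
    new_urls, new_data = parse(page_url, html_cont)
    print(len(new_urls), new_data.get('title'))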

spider_main
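A minimal sketch of how the modules in this article could be driven from a main crawl loop, assuming they are saved as urls_manager.py, html_downloader.py, html_parser.py and html_outeputer.py; the root URL and the page limit are placeholders, not values from the original project:

import urls_manager, html_downloader, html_parser, html_outeputer

def craw(root_url, max_count=100):
    # Seed the queue, then keep crawling until it is empty or the limit is hit.
    urls_manager.add_new_url(root_url)
    count = 1
    while urls_manager.has_new_url():
        try:
            new_url = urls_manager.get_new_url()
            print('craw %d : %s' % (count, new_url))
            html_cont = html_downloader.download(new_url)
            new_urls, new_data = html_parser.parse(new_url, html_cont)
            urls_manager.add_new_urls(new_urls)
            if new_data:
                html_outeputer.collect_data(new_data)
            if count >= max_count:
                break
            count = count + 1
        except Exception:
            print('craw failed')
    html_outeputer.output_html()

if __name__ == '__main__':
    craw('http://news.example.edu.cn/')   # placeholder root URL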

test_64


from bs4 import BeautifulSoup
import re

# Sample document taken from the BeautifulSoup documentation.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

print('Get all links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())

print('Get the lacie link')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

print('Regex match')
link_node = soup.find('a', href=re.compile(r'ill'))
print(link_node.name, link_node['href'], link_node.get_text())

print('Get the p paragraph text')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())
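Run as a script, the example above should print output along these lines (traced by hand from the sample document, so treat it as illustrative):

Get all links
a http://example.com/elsie Elsie
a http://example.com/lacie Lacie
a http://example.com/tillie Tillie
Get the lacie link
a http://example.com/lacie Lacie
Regex match
a http://example.com/tillie Tillie
Get the p paragraph text
p The Dormouse's story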

urls_manager


# Two sets track crawl state: URLs waiting to be crawled and URLs already crawled.
new_urls = set()
old_urls = set()

def add_new_url(url):
    """Queue a single URL if it has not been seen before."""
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)

def add_new_urls(urls):
    """Queue a batch of URLs."""
    if urls is None or len(urls) == 0:
        return
    for url in urls:
        add_new_url(url)

def get_new_url():
    """Take an arbitrary pending URL and mark it as crawled."""
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url

def has_new_url():
    """Return True while there are still URLs waiting to be crawled."""
    return len(new_urls) != 0
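On its own, the manager behaves like a de-duplicating work queue. A small self-contained check, with placeholder URLs:

add_new_urls({'http://example.com/a', 'http://example.com/b'})
add_new_url('http://example.com/a')    # already queued, so it is ignored
while has_new_url():
    print(get_new_url())               # each URL comes out exactly once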

Summary

That's all for this article. I hope it has been helpful.

