Python web crawling

Basic crawler operations

requests: checking whether a request succeeded

Collected snippet: crawler, requests, checking whether a request succeeded.

import requests
response=requests.get("https://www.pku.edu.cn")
print(response.status_code)#check whether the request got a proper response; a status code of 200 means the request succeeded.
#1XX: informational (request received). 2XX: success. 3XX: redirection (e.g. 305 Use Proxy). 4XX: client error (e.g. 403 Forbidden). 5XX: server error (e.g. 503 Service Unavailable).
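#A minimal sketch of acting on the status code (same URL as above): response.ok is True for any 2XX code, and raise_for_status() raises requests.HTTPError for 4XX/5XX responses.
response=requests.get("https://www.pku.edu.cn",timeout=10)#timeout is optional but avoids hanging forever
if response.ok:
    print("request succeeded:",response.status_code)
else:
    response.raise_for_status()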

 Writing the downloaded file to disk

import requests


webFile=requests.get("https://www.pku.edu.cn/about.html")#fetch the HTML page
webFile.encoding="utf-8"#set the encoding so the text decodes correctly
data=webFile.text#the page body as a plain string
print(data)

with open(r"E:/myDownload.html","w",encoding="utf-8") as file1:#write the downloaded page to a local file
    file1.write(data)


#[Example] Download several consecutive Wikisource chapters in one loop.
import requests

urlList=[]
a=1#starting chapter number; adjust as needed
for i in range(a,a+3):#while debugging, use range(a,a+1) to fetch only one chapter
    webUrl="https://zh.m.wikisource.org/wiki/春秋左傳正義/卷"+str(i)
    urlList.append(webUrl)

    webFile=requests.get(webUrl)
    webFile.encoding="utf-8"

    data=webFile.text

    myDfile="myDownload"+str(i)+".html"

#Option 1: a with-block
    with open(myDfile,"w",encoding="utf-8") as file1:#write the downloaded page to a local file
        file1.write(data)
#Option 2: open/write/close explicitly (note: this uses write, not print, and rewrites the same file)
    wFile=open(myDfile,"w",encoding="utf-8")
    wFile.write(data)
    wFile.close()

Observing the structure of a website

While scraping, you need to examine the structure of the site. The snippets below explore BeautifulSoup's object types on small HTML fragments.

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
</body>
</html>
"""
 
 
from bs4 import BeautifulSoup

 
soup=BeautifulSoup(html,'html.parser')

#print(soup)
#print(type(soup))#<class 'bs4.BeautifulSoup'>

tag=soup.find('p')
#print(tag)#Tag
string1=tag.string
#print(string1)#NavigableString
 

soup2=BeautifulSoup("<b><!--Hey--></b>",'html.parser')
comment=soup2.b.string
#print(comment)
#print(type(comment))


soup3=BeautifulSoup('<ad id=123 class="red blue">Hey</ad>','html.parser')
tag=soup3.ad
##print(tag.name)#'ad' is a made-up tag name for this example
##print(tag.attrs)

##A Tag object corresponds to a tag in the HTML document.
##
##For a tag, the two most important things are its name and its attributes (attrs).


#modifying the parse tree
soup=BeautifulSoup('<p id=123 class="red blue">Hey</p>','html.parser')
tag=soup.p
tag.name='a'
tag.attrs['id']=456
tag.attrs['class'][0]='white'
#print(soup)


from bs4 import BeautifulSoup
 
soup=BeautifulSoup('<p>Hey</p>','html.parser')
tag=soup.p
##print(tag)
##string=tag.string
##print(string)
##print(type(string))
##
##print(string.split('e'))
##
##print(string.lower())


#A NavigableString can be modified directly, or replaced using the replace_with() method.
from bs4 import BeautifulSoup
 
soup=BeautifulSoup('<p>Hey</p>','html.parser')
tag=soup.p
a='Hello'
tag.string=a
##print(soup)
##tag.string.replace_with('KO')
##print(soup)

html = """
<div>Total
    <p class="story"> First_p
        <a id="1">El</a>,
        <a id="2">E2</a>,
        <a id="3">E3</a>,
    </p>
    <p>Second_p</p>
</div>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html,'html.parser')
#print(soup)
tag=soup.p
#print(tag)
#When several tags share a name, accessing by attribute (soup.p) always returns the first such tag; use find_all to get the others.
##
##print(len(tag.contents))
##print(tag.contents)


html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<div>
<p class="a title"><b>The Dormouse's story</b></p>
<p class="a story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
</div>
<div>
<p class="st">Last<p class="st">......</p></p>
</div>
"""
 
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

#The name filter is simply an HTML tag name, e.g. find all <a> tags in the document.
#print(soup.find_all('a'))


#A regular expression is also accepted as a filter, e.g. all tags whose name contains 'a'.
print()
import re
#print(soup.find_all(re.compile('a')))
##
##
##Every element of a list is used as a filter criterion, e.g. search for all <a> and <b> tags.
#print(soup.find_all(['a','b']))
#print()



#print(soup.find_all('p')[1].find_all(True))
##Output:
##[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
## <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
## <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


def id_filter(tag):#a plain function can also be used as a filter
    return tag.has_attr('id') and tag['id']=='link2'

#print(soup.find_all('p')[1])

##print(soup.find_all('p')[1].find_all(id_filter))
#Output:
#[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

print(soup.select('.st'))

#BeautifulSoup also provides a prettify() method that re-renders incomplete or untidy HTML as an indented, readable document.
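#A minimal sketch of prettify() (a fresh fragment is parsed here rather than reusing the soups above):
messy=BeautifulSoup("<html><body><p>One</p><p>Two</p></body></html>",'html.parser')
print(messy.prettify())#re-rendered with one tag per line, indented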



Changing the saved file's extension (.htm or .txt) changes how the file is opened, which is surprisingly handy.
How do you create an .htm file? Simply save text under that extension; the resulting page can then be opened in a browser.

Request headers (browser identification)

Sometimes a request needs browser-identification headers (a User-Agent) and query parameters:

import requests
import csv


headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
    'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',

    'limit': '20',
    'sort_by': 'created'
}
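
These dicts are passed to requests.get through the headers and params keyword arguments. A minimal sketch (the Zhihu URL is simply the endpoint these notes use later; any URL works):

webFile=requests.get("https://www.zhihu.com/follow",headers=headers,params=params)
print(webFile.status_code)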

Using Beautiful Soup

The output of help(BeautifulSoup) is reproduced below for reference:

Help on class BeautifulSoup in module bs4:

class BeautifulSoup(bs4.element.Tag)
 |  BeautifulSoup(markup='', features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs)
 |
 |  A data structure representing a parsed HTML or XML document.
 |
 |  Most of the methods you'll call on a BeautifulSoup object are inherited from
 |  PageElement or Tag.
 |
 |  Internally, this class defines the basic interface called by the
 |  tree builders when converting an HTML/XML document into a data
 |  structure. The interface abstracts away the differences between
 |  parsers. To write a new tree builder, you'll need to understand
 |  these methods as a whole.
 |
 |  These methods will be called by the BeautifulSoup constructor:
 |    * reset()
 |    * feed(markup)
 |
 |  The tree builder may call these methods from its feed() implementation:
 |    * handle_starttag(name, attrs) # See note about return value
 |    * handle_endtag(name)
 |    * handle_data(data) # Appends to the current data node
 |    * endData(containerClass) # Ends the current data node
 |
 |  No matter how complicated the underlying parser is, you should be
 |  able to build a tree using 'start tag' events, 'end tag' events,
 |  'data' events, and "done with data" events.
 |
 |  If you encounter an empty-element tag (aka a self-closing tag,
 |  like HTML's <br> tag), call handle_starttag and then
 |  handle_endtag.
 |
 |  Method resolution order:
 |      BeautifulSoup
 |      bs4.element.Tag
 |      bs4.element.PageElement
 |      builtins.object
 |
 |  Methods defined here:
 |
 |  __copy__(self)
 |      Copy a BeautifulSoup object by converting the document to a string and parsing it again.
 |
 |  __getstate__(self)
 |
 |  __init__(self, markup='', features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs)
 |      Constructor.
 |
 |      :param markup: A string or a file-like object representing
 |       markup to be parsed.
 |
 |      :param features: Desirable features of the parser to be
 |       used. This may be the name of a specific parser ("lxml",
 |       "lxml-xml", "html.parser", or "html5lib") or it may be the
 |       type of markup to be used ("html", "html5", "xml"). It's
 |       recommended that you name a specific parser, so that
 |       Beautiful Soup gives you the same results across platforms
 |       and virtual environments.
 |
 |      :param builder: A TreeBuilder subclass to instantiate (or
 |       instance to use) instead of looking one up based on
 |       `features`. You only need to use this if you've implemented a
 |       custom TreeBuilder.
 |
 |      :param parse_only: A SoupStrainer. Only parts of the document
 |       matching the SoupStrainer will be considered. This is useful
 |       when parsing part of a document that would otherwise be too
 |       large to fit into memory.
 |
 |      :param from_encoding: A string indicating the encoding of the
 |       document to be parsed. Pass this in if Beautiful Soup is
 |       guessing wrongly about the document's encoding.
 |
 |      :param exclude_encodings: A list of strings indicating
 |       encodings known to be wrong. Pass this in if you don't know
 |       the document's encoding but you know Beautiful Soup's guess is
 |       wrong.
 |
 |      :param element_classes: A dictionary mapping BeautifulSoup
 |       classes like Tag and NavigableString, to other classes you'd
 |       like to be instantiated instead as the parse tree is
 |       built. This is useful for subclassing Tag or NavigableString
 |       to modify default behavior.
 |
 |      :param kwargs: For backwards compatibility purposes, the
 |       constructor accepts certain keyword arguments used in
 |       Beautiful Soup 3. None of these arguments do anything in
 |       Beautiful Soup 4; they will result in a warning and then be
 |       ignored.
 |
 |       Apart from this, any keyword arguments passed into the
 |       BeautifulSoup constructor are propagated to the TreeBuilder
 |       constructor. This makes it possible to configure a
 |       TreeBuilder by passing in arguments, not just by saying which
 |       one to use.
 |
 |  decode(self, pretty_print=False, eventual_encoding='utf-8', formatter='minimal')
 |      Returns a string or Unicode representation of the parse tree
 |          as an HTML or XML document.
 |
 |      :param pretty_print: If this is True, indentation will be used to
 |          make the document more readable.
 |      :param eventual_encoding: The encoding of the final document.
 |          If this is None, the document will be a Unicode string.
 |
 |  endData(self, containerClass=None)
 |      Method called by the TreeBuilder when the end of a data segment
 |      occurs.
 |
 |  handle_data(self, data)
 |      Called by the tree builder when a chunk of textual data is encountered.
 |
 |  handle_endtag(self, name, nsprefix=None)
 |      Called by the tree builder when an ending tag is encountered.
 |
 |      :param name: Name of the tag.
 |      :param nsprefix: Namespace prefix for the tag.
 |
 |  handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, sourcepos=None)
 |      Called by the tree builder when a new tag is encountered.
 |
 |      :param name: Name of the tag.
 |      :param nsprefix: Namespace prefix for the tag.
 |      :param attrs: A dictionary of attribute values.
 |      :param sourceline: The line number where this tag was found in its
 |          source document.
 |      :param sourcepos: The character position within `sourceline` where this
 |          tag was found.
 |
 |      If this method returns None, the tag was rejected by an active
 |      SoupStrainer. You should proceed as if the tag had not occurred
 |      in the document. For instance, if this was a self-closing tag,
 |      don't call handle_endtag.
 |
 |  insert_after(self, successor)
 |      This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
 |      it because there is nothing before or after it in the parse tree.
 |
 |  insert_before(self, successor)
 |      This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
 |      it because there is nothing before or after it in the parse tree.
 |
 |  new_string(self, s, subclass=None)
 |      Create a new NavigableString associated with this BeautifulSoup
 |      object.
 |
 |  new_tag(self, name, namespace=None, nsprefix=None, attrs={}, sourceline=None, sourcepos=None, **kwattrs)
 |      Create a new Tag associated with this BeautifulSoup object.
 |
 |  object_was_parsed(self, o, parent=None, most_recent_element=None)
 |      Method called by the TreeBuilder to integrate an object into the parse tree.
 |
 |  popTag(self)
 |      Internal method called by _popToTag when a tag is closed.
 |
 |  pushTag(self, tag)
 |      Internal method called by handle_starttag when a tag is opened.
 |
 |  reset(self)
 |      Reset this object to a state as though it had never parsed any
 |      markup.
 |
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |
 |  ASCII_SPACES = ' \n\t\x0c\r'
 |
 |  DEFAULT_BUILDER_FEATURES = ['html', 'fast']
 |
 |  NO_PARSER_SPECIFIED_WARNING = 'No parser was explicitly specified, so ...
 |
 |  ROOT_TAG_NAME = '[document]'
 |
 |  ----------------------------------------------------------------------
 |  Methods inherited from bs4.element.Tag:
 |
 |  __bool__(self)
 |      A tag is non-None even if it has no contents.
 |
 |  __call__(self, *args, **kwargs)
 |      Calling a Tag like a function is the same as calling its
 |      find_all() method. Eg. tag('a') returns a list of all the A tags
 |      found within this tag.
 |
 |  __contains__(self, x)
 |
 |  __delitem__(self, key)
 |      Deleting tag[key] deletes all 'key' attributes for the tag.
 |
 |  __eq__(self, other)
 |      Returns true iff this Tag has the same name, the same attributes,
 |      and the same contents (recursively) as `other`.
 |
 |  __getattr__(self, tag)
 |      Calling tag.subtag is the same as calling tag.find(name="subtag")
 |
 |  __getitem__(self, key)
 |      tag[key] returns the value of the 'key' attribute for the Tag,
 |      and throws an exception if it's not there.
 |
 |  __hash__(self)
 |      Return hash(self).
 |
 |  __iter__(self)
 |      Iterating over a Tag iterates over its contents.
 |
 |  __len__(self)
 |      The length of a Tag is the length of its list of contents.
 |
 |  __ne__(self, other)
 |      Returns true iff this Tag is not identical to `other`,
 |      as defined in __eq__.
 |
 |  __repr__ = __unicode__(self)
 |
 |  __setitem__(self, key, value)
 |      Setting tag[key] sets the value of the 'key' attribute for the
 |      tag.
 |
 |  __str__ = __unicode__(self)
 |
 |  __unicode__(self)
 |      Renders this PageElement as a Unicode string.
 |
 |  childGenerator(self)
 |      Deprecated generator.
 |
 |  clear(self, decompose=False)
 |      Wipe out all children of this PageElement by calling extract()
 |         on them.
 |
 |      :param decompose: If this is True, decompose() (a more
 |          destructive method) will be called instead of extract().
 |
 |  decode_contents(self, indent_level=None, eventual_encoding='utf-8', formatter='minimal')
 |      Renders the contents of this tag as a Unicode string.
 |
 |      :param indent_level: Each line of the rendering will be
 |         indented this many spaces. Used internally in
 |         recursive calls while pretty-printing.
 |
 |      :param eventual_encoding: The tag is destined to be
 |         encoded into this encoding. decode_contents() is _not_
 |         responsible for performing that encoding. This information
 |         is passed in so that it can be substituted in if the
 |         document contains a <META> tag that mentions the document's
 |         encoding.
 |
 |      :param formatter: A Formatter object, or a string naming one of
 |          the standard Formatters.
 |
 |  decompose(self)
 |      Recursively destroys this PageElement and its children.
 |
 |      This element will be removed from the tree and wiped out; so
 |      will everything beneath it.
 |
 |  encode(self, encoding='utf-8', indent_level=None, formatter='minimal', errors='xmlcharrefreplace')
 |      Render a bytestring representation of this PageElement and its
 |      contents.
 |
 |      :param encoding: The destination encoding.
 |      :param indent_level: Each line of the rendering will be
 |          indented this many spaces. Used internally in
 |          recursive calls while pretty-printing.
 |      :param formatter: A Formatter object, or a string naming one of
 |          the standard formatters.
 |      :param errors: An error handling strategy such as
 |          'xmlcharrefreplace'. This value is passed along into
 |          encode() and its value should be one of the constants
 |          defined by Python.
 |      :return: A bytestring.
 |
 |  encode_contents(self, indent_level=None, encoding='utf-8', formatter='minimal')
 |      Renders the contents of this PageElement as a bytestring.
 |
 |      :param indent_level: Each line of the rendering will be
 |         indented this many spaces. Used internally in
 |         recursive calls while pretty-printing.
 |
 |      :param eventual_encoding: The bytestring will be in this encoding.
 |
 |      :param formatter: A Formatter object, or a string naming one of
 |          the standard Formatters.
 |
 |      :return: A bytestring.
 |
 |  find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
 |      Look in the children of this PageElement and find the first
 |      PageElement that matches the given criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param recursive: If this is True, find() will perform a
 |          recursive search of this PageElement's children. Otherwise,
 |          only the direct children will be considered.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  findAll = find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
 |
 |  findChild = find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
 |
 |  findChildren = find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
 |
 |  find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
 |      Look in the children of this PageElement and find all
 |      PageElements that match the given criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param recursive: If this is True, find_all() will perform a
 |          recursive search of this PageElement's children. Otherwise,
 |          only the direct children will be considered.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A ResultSet of PageElements.
 |      :rtype: bs4.element.ResultSet
 |
 |  get(self, key, default=None)
 |      Returns the value of the 'key' attribute for the tag, or
 |      the value given for 'default' if it doesn't have that
 |      attribute.
 |
 |  get_attribute_list(self, key, default=None)
 |      The same as get(), but always returns a list.
 |
 |      :param key: The attribute to look for.
 |      :param default: Use this value if the attribute is not present
 |          on this PageElement.
 |      :return: A list of values, probably containing only a single
 |          value.
 |
 |  get_text(self, separator='', strip=False, types=(<class 'bs4.element.NavigableString'>, <class 'bs4.element.CData'>))
 |      Get all child strings, concatenated using the given separator.
 |
 |      :param separator: Strings will be concatenated using this separator.
 |
 |      :param strip: If True, strings will be stripped before being
 |          concatenated.
 |
 |      :types: A tuple of NavigableString subclasses. Any strings of
 |          a subclass not found in this list will be ignored. By
 |          default, this means only NavigableString and CData objects
 |          will be considered. So no comments, processing instructions,
 |          etc.
 |
 |      :return: A string.
 |
 |  has_attr(self, key)
 |      Does this PageElement have an attribute with the given name?
 |
 |  has_key(self, key)
 |      Deprecated method. This was kind of misleading because has_key()
 |      (attributes) was different from __in__ (contents).
 |
 |      has_key() is gone in Python 3, anyway.
 |
 |  index(self, element)
 |      Find the index of a child by identity, not value.
 |
 |      Avoids issues with tag.contents.index(element) getting the
 |      index of equal elements.
 |
 |      :param element: Look for this PageElement in `self.contents`.
 |
 |  prettify(self, encoding=None, formatter='minimal')
 |      Pretty-print this PageElement as a string.
 |
 |      :param encoding: The eventual encoding of the string. If this is None,
 |          a Unicode string will be returned.
 |      :param formatter: A Formatter object, or a string naming one of
 |          the standard formatters.
 |      :return: A Unicode string (if encoding==None) or a bytestring
 |          (otherwise).
 |
 |  recursiveChildGenerator(self)
 |      Deprecated generator.
 |
 |  renderContents(self, encoding='utf-8', prettyPrint=False, indentLevel=0)
 |      Deprecated method for BS3 compatibility.
 |
 |  select(self, selector, namespaces=None, limit=None, **kwargs)
 |      Perform a CSS selection operation on the current element.
 |
 |      This uses the SoupSieve library.
 |
 |      :param selector: A string containing a CSS selector.
 |
 |      :param namespaces: A dictionary mapping namespace prefixes
 |         used in the CSS selector to namespace URIs. By default,
 |         Beautiful Soup will use the prefixes it encountered while
 |         parsing the document.
 |
 |      :param limit: After finding this number of results, stop looking.
 |
 |      :param kwargs: Keyword arguments to be passed into SoupSieve's
 |         soupsieve.select() method.
 |
 |      :return: A ResultSet of PageElements.
 |      :rtype: bs4.element.ResultSet
 |
 |  select_one(self, selector, namespaces=None, **kwargs)
 |      Perform a CSS selection operation on the current element.
 |
 |      :param selector: A CSS selector.
 |
 |      :param namespaces: A dictionary mapping namespace prefixes
 |         used in the CSS selector to namespace URIs. By default,
 |         Beautiful Soup will use the prefixes it encountered while
 |         parsing the document.
 |
 |      :param kwargs: Keyword arguments to be passed into SoupSieve's
 |         soupsieve.select() method.
 |
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  smooth(self)
 |      Smooth out this element's children by consolidating consecutive
 |      strings.
 |
 |      This makes pretty-printed output look more natural following a
 |      lot of operations that modified the tree.
 |
 |  ----------------------------------------------------------------------
 |  Readonly properties inherited from bs4.element.Tag:
 |
 |  children
 |      Iterate over all direct children of this PageElement.
 |
 |      :yield: A sequence of PageElements.
 |
 |  descendants
 |      Iterate over all children of this PageElement in a
 |      breadth-first sequence.
 |
 |      :yield: A sequence of PageElements.
 |
 |  isSelfClosing
 |      Is this tag an empty-element tag? (aka a self-closing tag)
 |
 |      A tag that has contents is never an empty-element tag.
 |
 |      A tag that has no contents may or may not be an empty-element
 |      tag. It depends on the builder used to create the tag. If the
 |      builder has a designated list of empty-element tags, then only
 |      a tag whose name shows up in that list is considered an
 |      empty-element tag.
 |
 |      If the builder has no designated list of empty-element tags,
 |      then any tag with no contents is an empty-element tag.
 |
 |  is_empty_element
 |      Is this tag an empty-element tag? (aka a self-closing tag)
 |
 |      A tag that has contents is never an empty-element tag.
 |
 |      A tag that has no contents may or may not be an empty-element
 |      tag. It depends on the builder used to create the tag. If the
 |      builder has a designated list of empty-element tags, then only
 |      a tag whose name shows up in that list is considered an
 |      empty-element tag.
 |
 |      If the builder has no designated list of empty-element tags,
 |      then any tag with no contents is an empty-element tag.
 |
 |  strings
 |      Yield all strings of certain classes, possibly stripping them.
 |
 |      :param strip: If True, all strings will be stripped before being
 |          yielded.
 |
 |      :types: A tuple of NavigableString subclasses. Any strings of
 |          a subclass not found in this list will be ignored. By
 |          default, this means only NavigableString and CData objects
 |          will be considered. So no comments, processing instructions,
 |          etc.
 |
 |      :yield: A sequence of strings.
 |
 |  stripped_strings
 |      Yield all strings in the document, stripping them first.
 |
 |      :yield: A sequence of stripped strings.
 |
 |  text
 |      Get all child strings, concatenated using the given separator.
 |
 |      :param separator: Strings will be concatenated using this separator.
 |
 |      :param strip: If True, strings will be stripped before being
 |          concatenated.
 |
 |      :types: A tuple of NavigableString subclasses. Any strings of
 |          a subclass not found in this list will be ignored. By
 |          default, this means only NavigableString and CData objects
 |          will be considered. So no comments, processing instructions,
 |          etc.
 |
 |      :return: A string.
 |
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from bs4.element.Tag:
 |
 |  parserClass
 |
 |  string
 |      Convenience property to get the single string within this
 |      PageElement.
 |
 |      TODO It might make sense to have NavigableString.string return
 |      itself.
 |
 |      :return: If this element has a single string child, return
 |       value is that string. If this element has one child tag,
 |       return value is the 'string' attribute of the child tag,
 |       recursively. If this element is itself a string, has no
 |       children, or has more than one child, return value is None.
 |
 |  ----------------------------------------------------------------------
 |  Methods inherited from bs4.element.PageElement:
 |
 |  append(self, tag)
 |      Appends the given PageElement to the contents of this one.
 |
 |      :param tag: A PageElement.
 |
 |  extend(self, tags)
 |      Appends the given PageElements to this one's contents.
 |
 |      :param tags: A list of PageElements.
 |
 |  extract(self)
 |      Destructively rips this element out of the tree.
 |
 |      :return: `self`, no longer part of the tree.
 |
 |  fetchNextSiblings = find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  fetchParents = find_parents(self, name=None, attrs={}, limit=None, **kwargs)
 |
 |  fetchPrevious = find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  fetchPreviousSiblings = find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  findAllNext = find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  findAllPrevious = find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  findNext = find_next(self, name=None, attrs={}, text=None, **kwargs)
 |
 |  findNextSibling = find_next_sibling(self, name=None, attrs={}, text=None, **kwargs)
 |
 |  findNextSiblings = find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  findParent = find_parent(self, name=None, attrs={}, **kwargs)
 |
 |  findParents = find_parents(self, name=None, attrs={}, limit=None, **kwargs)
 |
 |  findPrevious = find_previous(self, name=None, attrs={}, text=None, **kwargs)
 |
 |  findPreviousSibling = find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs)
 |
 |  findPreviousSiblings = find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |      Find all PageElements that match the given criteria and appear
 |      later in the document than this PageElement.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A ResultSet containing PageElements.
 |
 |  find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |      Look backwards in the document from this PageElement and find all
 |      PageElements that match the given criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A ResultSet of PageElements.
 |      :rtype: bs4.element.ResultSet
 |
 |  find_next(self, name=None, attrs={}, text=None, **kwargs)
 |      Find the first PageElement that matches the given criteria and
 |      appears later in the document than this PageElement.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_next_sibling(self, name=None, attrs={}, text=None, **kwargs)
 |      Find the closest sibling to this PageElement that matches the
 |      given criteria and appears later in the document.
 |
 |      All find_* methods take a common set of arguments. See the
 |      online documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |      Find all siblings of this PageElement that match the given criteria
 |      and appear later in the document.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A ResultSet of PageElements.
 |      :rtype: bs4.element.ResultSet
 |
 |  find_parent(self, name=None, attrs={}, **kwargs)
 |      Find the closest parent of this PageElement that matches the given
 |      criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :kwargs: A dictionary of filters on attribute values.
 |
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_parents(self, name=None, attrs={}, limit=None, **kwargs)
 |      Find all parents of this PageElement that match the given criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_previous(self, name=None, attrs={}, text=None, **kwargs)
 |      Look backwards in the document from this PageElement and find the
 |      first PageElement that matches the given criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs)
 |      Returns the closest sibling to this PageElement that matches the
 |      given criteria and appears earlier in the document.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |      Returns all siblings to this PageElement that match the
 |      given criteria and appear earlier in the document.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A ResultSet of PageElements.
 |      :rtype: bs4.element.ResultSet
 |
 |  format_string(self, s, formatter)
 |      Format the given string using the given formatter.
 |
 |      :param s: A string.
 |      :param formatter: A Formatter object, or a string naming one of the standard formatters.
 |
 |  formatter_for_name(self, formatter)
 |      Look up or create a Formatter for the given identifier,
 |      if necessary.
 |
 |      :param formatter: Can be a Formatter object (used as-is), a
 |          function (used as the entity substitution hook for an
 |          XMLFormatter or HTMLFormatter), or a string (used to look
 |          up an XMLFormatter or HTMLFormatter in the appropriate
 |          registry.
 |
 |  insert(self, position, new_child)
 |      Insert a new PageElement in the list of this PageElement's children.
 |
 |      This works the same way as `list.insert`.
 |
 |      :param position: The numeric position that should be occupied
 |         in `self.children` by the new PageElement.
 |      :param new_child: A PageElement.
 |
 |  nextGenerator(self)
 |      # Old non-property versions of the generators, for backwards
 |      # compatibility with BS3.
 |
 |  nextSiblingGenerator(self)
 |
 |  parentGenerator(self)
 |
 |  previousGenerator(self)
 |
 |  previousSiblingGenerator(self)
 |
 |  replaceWith = replace_with(self, replace_with)
 |
 |  replaceWithChildren = unwrap(self)
 |
 |  replace_with(self, replace_with)
 |      Replace this PageElement with another one, keeping the rest of the
 |      tree the same.
 |
 |      :param replace_with: A PageElement.
 |      :return: `self`, no longer part of the tree.
 |
 |  replace_with_children = unwrap(self)
 |
 |  setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None)
 |      Sets up the initial relations between this element and
 |      other elements.
 |
 |      :param parent: The parent of this element.
 |
 |      :param previous_element: The element parsed immediately before
 |          this one.
 |
 |      :param next_element: The element parsed immediately before
 |          this one.
 |
 |      :param previous_sibling: The most recently encountered element
 |          on the same level of the parse tree as this one.
 |
 |      :param previous_sibling: The next element to be encountered
 |          on the same level of the parse tree as this one.
 |
 |  unwrap(self)
 |      Replace this PageElement with its contents.
 |
 |      :return: `self`, no longer part of the tree.
 |
 |  wrap(self, wrap_inside)
 |      Wrap this PageElement inside another one.
 |
 |      :param wrap_inside: A PageElement.
 |      :return: `wrap_inside`, occupying the position in the tree that used
 |         to be occupied by `self`, and with `self` inside it.
 |
 |  ----------------------------------------------------------------------
 |  Readonly properties inherited from bs4.element.PageElement:
 |
 |  next
 |      The PageElement, if any, that was parsed just after this one.
 |
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  next_elements
 |      All PageElements that were parsed after this one.
 |
 |      :yield: A sequence of PageElements.
 |
 |  next_siblings
 |      All PageElements that are siblings of this one but were parsed
 |      later.
 |
 |      :yield: A sequence of PageElements.
 |
 |  parents
 |      All PageElements that are parents of this PageElement.
 |
 |      :yield: A sequence of PageElements.
 |
 |  previous
 |      The PageElement, if any, that was parsed just before this one.
 |
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  previous_elements
 |      All PageElements that were parsed before this one.
 |
 |      :yield: A sequence of PageElements.
 |
 |  previous_siblings
 |      All PageElements that are siblings of this one but were parsed
 |      earlier.
 |
 |      :yield: A sequence of PageElements.
 |
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from bs4.element.PageElement:
 |
 |  __dict__
 |      dictionary for instance variables (if defined)
 |
 |  __weakref__
 |      list of weak references to the object (if defined)
 |
 |  nextSibling
 |
 |  previousSibling


BeautifulSoup: using find_all

Collected snippet: crawler, BeautifulSoup find_all usage.

import requests
from bs4 import BeautifulSoup as bs


webFile=requests.get("https://www.pku.edu.cn")#fetch the HTML page
webFile.encoding="utf-8"#set the encoding so the text decodes correctly
data=webFile.text#the page body as a plain string

soup=bs(data,"html.parser")#parse the page into a BeautifulSoup object
##
##items=soup.find_all(class_="h")
##for i in items:
##    print(i)



items2=soup.find_all(class_="item")

for iTag in items2:
    for i in iTag.find_all():
        print(i)



The difference between requests and BeautifulSoup

import requests
file1=requests.get("https://www.pku.edu.cn")

file1.encoding="utf-8"
data=file1.text

myFile=open(r"E:\pkuCode.txt","w",encoding="utf-8")
print(data,file=myFile)

myFile.close()


'''
soup has type <class 'bs4.BeautifulSoup'>, i.e. soup is a BeautifulSoup object.
Printing soup shows the complete HTML source of the requested page.
Although response.text and the printed soup look identical, they belong to different classes: <class 'str'> versus <class 'bs4.BeautifulSoup'>. The former is a plain string; the latter is a parsed BeautifulSoup object. They print the same text only because printing a BeautifulSoup object calls its __str__ method, which renders the parse tree back into a string.
'''
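
A minimal sketch, assuming data still holds the page source fetched above, that makes the type difference visible:

from bs4 import BeautifulSoup
soup=BeautifulSoup(data,"html.parser")
print(type(data))#<class 'str'>
print(type(soup))#<class 'bs4.BeautifulSoup'>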

Extracting the text of a page and presenting it in order

Crawler + bs4: extract the links and text from the PKU homepage and present them in order.

import requests
from bs4 import BeautifulSoup as bs


webFile=requests.get("https://www.pku.edu.cn")#fetch the HTML page
webFile.encoding="utf-8"#set the encoding so the text decodes correctly
data=webFile.text#the page body as a plain string

soup=bs(data,"html.parser")#parse the page into a BeautifulSoup object


items2=soup.find_all(class_="item")

##for iTag in items2:
##    for i in iTag.find_all():
##        myText=i.get_text()
##        print(myText)
##

for everyTag in items2:
    print(everyTag)

    print()
    print("Text part")
    myText=everyTag.get_text()
    print(myText)

    print()
    print("Link part")
    myLinks=everyTag.find_all("a")#each element of myLinks is a bs4 Tag
    for everyLink in myLinks:
        if "href" in everyLink.attrs:#.attrs is only available on bs4 Tag objects
            print(everyLink)
    input()#pause; press Enter to move on to the next block


global

 Variable scope
 A variable assigned inside a function can only be used inside that function (local scope); such variables are called local variables.
 A variable assigned outside of all functions can be used anywhere in the program (global scope); such variables are called global variables.
To make an assignment inside a function target a global variable, use the global statement.

tfc = 1000

def tvc():
    global tvc  # the global statement usually comes first in the function body; it tells Python "tvc refers to the global name, do not create a local variable"
    vc = 200
    x = 10
    tvc = vc * x  # note: this rebinds the global name tvc (previously the function itself) to 2000, so tvc() can only be called once

def tc():
    print(tfc+tvc)  # tc() can now read the global tvc assigned above

tvc()
tc()
# >> 3000


 

match

import re
m=re.match("hello","hellov world")
if m is not None:
    print(m.group())
    
print(m.__class__.__name__)


m=re.match("bird","bird is flying")
print(m.group())
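
re.match only matches at the beginning of the string, which is why "hello" matches "hellov world" above. A minimal sketch contrasting it with re.search, which scans the whole string:

import re
print(re.match("world","hello world"))#None: "world" is not at the start
print(re.search("world","hello world").group())#'world': search scans the entire string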

Using soup.prettify() for an orderly display

import requests
import csv
from bs4 import BeautifulSoup as bs

url="https://www.zhihu.com/follow"

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
    'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
    'limit': '20',
    'sort_by': 'created'
}

webFile= requests.get(url, params=params, headers=headers)
webFile.encoding="utf-8"
data=webFile.text   

soup=bs(data,"html.parser")
print(soup.prettify())

Scraping bookmark labels

Extracting bookmark labels from a saved web page

After saving a page with Ctrl+S from Chaoxing (超星), Wikisource (维基), CNKI (知网) or Apabi (阿帕比), the code below extracts the table-of-contents text from the saved file.

myWord="""
[Images]
[Font]
Language=GBK
FontSize=7
Margin=0.5

[Bkmk]
File=FreePic2Pdf_bkmk.txt
AddAsText=0
ShowBkmk=1
ShowAll=1
BasePage=1

[Main]
ContentsPage=
TextPage=
"""
Head='''
首
\t书名页
\t版权页
\t序言
目录

'''

def test():
    htmlName=str(input("请输入网页Wiki CNKI ChoaXing Apabi文件名称:"))

    import requests
    from bs4 import BeautifulSoup as bs

    webFile=open(htmlName,"r",encoding="utf-8")
    data=webFile.read()
    webFile.close()

    mysoup=bs(data,"html.parser")
    mysoup.prettify()

    writeFile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")

    print(Head,file=writeFile)

                 
    if "维基文库" in htmlName:
        print("Wiki")
        result=mysoup.find_all("li")
        choice=input("请选择通行A 或 调试T:")
        for i in result:
            myInfo=i.get_text()

            if choice=="A":
                if "卷" in myInfo:
                    mylist=myInfo.split(" ")
                    print(mylist[0],file=writeFile)
                    for m in mylist[1:]:
                        print("\t",m,file=writeFile)
            elif choice=="T":
                if "卷" in myInfo:
                    print(myInfo,file=writeFile)
                else:
                    print("\t",myInfo,file=writeFile)
    elif "阿帕比" in htmlName:
        print("Apabi")
        result=mysoup.find_all("li")
        for i in result:
            myInfo=i.get_text()
            for word in "()1234567890页":
                myInfo=myInfo.replace(word,"")

            infoList=myInfo.split(" ")
            if len(infoList)>2:#exclude single-element entries; handle the split uniformly
                print(infoList[1],file=writeFile)
                for m in infoList[2:]:
                        print("\t",m,file=writeFile)
            elif len(infoList)==2:
                print("\t",myInfo,file=writeFile)
                
        
    elif "中国知网" in htmlName or "CNKI" in htmlName:
        print("CNKI")
        result=mysoup.find_all(attrs={"class":"catalog-listDiv"})
        if len(result)==0:
            result=mysoup.find_all("li")
        
        for i in result:
            myInfo=i.get_text()
            infoline=myInfo.split("    ")
            for line in infoline:
                if "摘要" in line:
                    nline=line.split(" ")
                    for m in nline:
                        print(m,file=writeFile)
                elif "第" in line and  "章" in line and "节" not in line:
                    wline=line.split(" ")
                    print("\t",wline[0],file=writeFile)
                    for m in wline[1:]:
                        print(m,end="",file=writeFile)
                    print("\n",file=writeFile)

     
                elif "结语 参考文献 致谢" in line:
                    nline=line.split(" ")
                    print(nline[0]+nline[1],file=writeFile)
                    for m in nline[2:]:
                        print(m,file=writeFile)
                else:print("\t",line,file=writeFile)

    else:
        print("ChaoXing")
        result=mysoup.find_all("span")

        for i in result:
            if "node_name" in str(i):
                sen=i.get_text()
                sen=sen.lstrip(" ")
                
                if  "第" in str(i) and "章" in str(i):
                    print(sen,file=writeFile)
                elif  "第" in str(i) and "讲" in str(i):
                    print(sen,file=writeFile)
                elif "卷" in str(i) or "论" in str(i) or "编" in str(i):
                    for hz in "一二三四五六七八九十":
                        if hz in str(i):
                            print(sen,file=writeFile)
                            break
                    else:print("\t",sen,file=writeFile)

                else:
                    print("\t",sen,file=writeFile)


    print("尾",file=writeFile)
    writeFile.close()

    itfFile=open("FreePic2Pdf.itf","w",encoding="utf-8")
    print(myWord,file=itfFile)
    itfFile.close()



This completes the task.

Walkthrough: scraping a saved e-book page

As an example, take an electronic text from the ebook collection (https://ebook.dswxyjy.org.cn/) and walk through the process.

The book is 《建国以来重要文献选编》(第一册).

Log in to the site and click through until the e-book's table of contents is displayed.

Then save the page locally with Ctrl+S. The code below extracts the text from the saved file.

htmlName="建国以来重要文献选编(第一册).html"

import requests
from bs4 import BeautifulSoup as bs

webFile=open(htmlName,"r",encoding="utf-8")
data=webFile.read()
webFile.close()


mysoup=bs(data,"html.parser")
mysoup.prettify()

print(mysoup.prettify())

Running this from the command line produces output like the following (excerpt):

left: 56178px;">
       <div class="item left" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); left: 0px; right: auto;">
        <img style='background-color: rgb(255, 255, 255); background-image: url("style/icon/loading.gif"); background-repeat: no-repeat; background-position: center center; width: 118px; height: 170px;'/>
        <p class="title" style="display: none;">
         454
        </p>
       </div>
       <div class="item right" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); right: 0px; left: auto;">
        <img style='background-color: rgb(255, 255, 255); background-image: url("style/icon/loading.gif"); background-repeat: no-repeat; background-position: center center; width: 118px; height: 170px;'/>

At this point, the table-of-contents text we need does not appear in the output.

However, opening the downloaded HTML file in a plain-text editor shows that the TOC text is in fact there, for example:

<li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: rgba(255, 255, 255, 0.1); font-weight: bold;"><img class="arrow" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAAXNSR0IArs4c6QAAADtJREFUKFNjZCASMBKpjoE0hf+Py2RGRkawYXATsSmGKUJRCOIgK0ZWhKEQphhdEVaFON1K9eABAFosEAvuw3jXAAAAAElFTkSuQmCC" style="left: 0px;"><p class="description" style="margin-left: 15px;">《建国以来重要文献选编》(第一册)</p></li><ul class="itemList"><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">中国人民政治协商会议共同纲领(一九四九年九月二十九日中国人民政治协商会议第一届全体会议通过)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">人民政协共同纲领草案的特点(一九四九年九月二十二日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">中华人民共和国中央人民政府公告(一九四九年十月一日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">中国人民解放军总部命令(一九四九年十月一日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">中共中央关于少数民族“自决权”问题给二野前委的指示(一九四九年十月五日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">新华总社关于土改后农村阶级划分问题给东北总分社的复电(一九四九年十月十一日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">关于学习松江县召开各界人民代表会议经验的指示(一九四九年十月十三日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">毛泽东批转薄一波《关于华北各城市召开各界代表会议的情形和经验的报告》(一九四九年十月三十日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0);"><p class="description" style="margin-left: 25px;">制止物价猛涨(一九四九年十一月十三日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0);"><p class="description" style="margin-left: 25px;">关于大量吸收和培养少数民族干部的指示(一九四九年十一月十四日)</p></li>

Given this, a different strategy is needed.

Looking at the raw text, each title turns out to sit in an element of the following form:

<p class="description" style="margin-left: 25px;">中央人民政府委员会关于发行人民胜利折实公债的决定

So the elements can be selected with: mysoup.find_all(attrs={"class":"description"})

htmlName="建国以来重要文献选编(第一册).html"

import requests
from bs4 import BeautifulSoup as bs

webFile=open(htmlName,"r",encoding="utf-8")
data=webFile.read()
webFile.close()



mysoup=bs(data,"html.parser")
mysoup.prettify()

result=mysoup.find_all(attrs={"class":"description"})#select the elements that contain the TOC text

for i in result:
    myInfo=i.get_text()#extract the text from the element
    print(myInfo)

This pulls out the TOC text. After some debugging, the complete code is as follows:

htmlName="建国以来重要文献选编(第四册).html"

Head='''首\n\t书名页\n\t版权页\n\t序言\n目录'''

import requests
from bs4 import BeautifulSoup as bs

webFile=open(htmlName,"r",encoding="utf-8")
data=webFile.read()
webFile.close()


writeFile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")
print(Head,file=writeFile)

mysoup=bs(data,"html.parser")
mysoup.prettify()

result=mysoup.find_all(attrs={"class":"description"})#select the elements that contain the TOC text

for i in result[1:]:
    myInfo=i.get_text()#extract the text from the element
    print(myInfo,file=writeFile)

writeFile.close()

Finally, the finished program:

#myexample="建国以来重要文献选编(第四册).html"

htmlName=input("请输入社科院ebook网页文件名称:")
Head='''首\n\t书名页\n\t版权页\n\t序言\n目录'''

import requests
from bs4 import BeautifulSoup as bs

webFile=open(htmlName,"r",encoding="utf-8")
data=webFile.read()
webFile.close()


writeFile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")
print(Head,file=writeFile)

mysoup=bs(data,"html.parser")
mysoup.prettify()

result=mysoup.find_all(attrs={"class":"description"})#select the elements that contain the TOC text

for i in result[1:]:
    myInfo=i.get_text()#extract the text from the element
    print(myInfo,file=writeFile)

writeFile.close()
print("已输出完毕。")




That completes the scraping task.

Extracting labels from a text file

The following code can be used:

def test():
    import re
    pattern="“.*?[。?:;”]"

    fileName=input("选择句子开头作为标签,请输入文本名称:")#说文解字,尔雅

    part=input("请输入1或2个区分层级关键词{第部章卷...}:")
    if len(part)==1:
        a=part
        b=part
    elif len(part)==2:
        a=part[0]
        b=part[1]
    choice="L"
    choice=input("文本对话选L;Wiki目录选W;开头首字母选S;开头前面句子选E:")
    choice=choice.upper()

    file=open(fileName,"r",encoding="utf-8")
    data=file.read()
    file.close()

    data=data.replace("编辑","")
    datalines=data.splitlines()

    def ShuoWen():
        #说文
        for line in datalines:
            for word in line:
                if word in "(( )0123456789:↑":
                    break
            print("\t",word,file=wfile)
    def ErYa():
        for line in datalines:
            if part in line:
                print(line,file=wfile)
            else:print("\t",line[:5],file=wfile)
    def Wiki():
        for line in datalines:
            if part in line and len(line)<=4 and len(line)>=2:
                print(line,file=wfile)
            elif "↑" in line or "◄" in line or "►" in line or " 註釋"  in line:pass
            elif len(line)>=2 and len(line)<=10:
                print("\t",line,file=wfile)            
    def LunYu():
        zhang=0
        
        jieming=0
        for line in datalines:
            if a in line and b in line:
                print(line,file=wfile)
                zhang+=1
                jieming=1

            if a not in line and b not in line and len(line)>4:#[Note] two consecutive if statements differ from if/else: both conditions are tested independently, whereas if/else takes only one branch
                result=re.compile(pattern).findall(line)
                print("\t",f"{zhang}.{jieming}",end="",file=wfile)

                if len(result)!=0:#prefer the sentence inside quotation marks
                    jieming+=1
                    n=0
                    for i in result:
                        i=i.lstrip("“")
                        print(i,file=wfile)
                        n+=1
                        if n==1:
                           break                    
                else:#no quoted sentence: fall back to the opening clause
                    jieming+=1
                    for w in line:
                        print(w,end="",file=wfile)
                        if w in ":。;":
                            break
                print("\n",file=wfile)
            
            

    wfile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")
    if choice=="S":
        ShuoWen()
    elif choice=="E":
        ErYa()
    elif choice=="W":
        Wiki()
    elif choice=="L":
        LunYu()
    wfile.close()
    print("已经完成")

That does it.

Scraping text

Crawler practice: downloading text from Wikisource

def test():
    import requests
    from bs4 import BeautifulSoup as bs
    import time
    import random
    import re 

    webUrl=input("请输入书籍所在的维基网址:")
    infoList=webUrl.split("/")
    articleName=infoList[-1]
    

    startTime=time.time()


    writeFile=open(f"{articleName}.txt","a",encoding="utf-8")
    webFile=requests.get(webUrl)
    webFile.encoding="utf-8"
    
    data=webFile.text

    obs=bs(data,"html.parser")
    obs.prettify()
    resultLink=obs.find_all("li")

    webList=[]
    for link in resultLink:
        if articleName in str(link):
            iname=link.get_text()
            iweb=webUrl+"/"+iname
            webList.append(iweb)

    for iweb in webList:
        print(iweb)
        iFile=requests.get(iweb)
        iFile.encoding="utf-8"
        idata=iFile.text
        iobs=bs(idata,"html.parser")
        iobs.prettify()

        result0=iobs.find_all(attrs={"class":"section-heading"})


##        result1=iobs.find_all("section")
##        print(result1)
        
        result1=iobs.find_all(attrs={"class":"mw-parser-output"})
##        for i in result1:
##            print(i.get_text(),file=writeFile)
##
        if len(result0)!=0:
            result1.pop(0)#use this branch when the page headings contain extra information
            xy=zip(result0,result1)
            for i in xy:
                print(i[0].get_text()[:-2],file=writeFile)#this branch was used when downloading 春秋左傳正義
                print(i[1].get_text(),file=writeFile)
                
        else:
            for i in result1:
                print(i.get_text(),file=writeFile)#this branch was used when downloading 史记三家注
                
        time.sleep(0.05+random.randint(0,2))
    writeFile.close()

    endTime=time.time()
    long=(endTime-startTime)/60
    print("总记时:","{0:4.2}".format(long),"分钟。")

The code can be tidied a little further; the version below adds comments noting that the chapter-link format may need adjusting for some sites.

def test():
    import requests
    from bs4 import BeautifulSoup as bs
    import time
    import random
    import re 

    webUrl=input("请输入书籍所在的维基网址:")
    infoList=webUrl.split("/")
    articleName=infoList[-1]
    

    startTime=time.time()


    writeFile=open(f"{articleName}.txt","a",encoding="utf-8")
    webFile=requests.get(webUrl)
    webFile.encoding="utf-8"
    
    data=webFile.text

    obs=bs(data,"html.parser")
    obs.prettify()
    resultLink=obs.find_all("li")

    webList=[]#the chapter-link format may need adjusting for the actual site
    for link in resultLink:
        if articleName in str(link):
            iname=link.get_text()
            iweb=webUrl+"/"+iname
            webList.append(iweb)#有的网站是“卷01”,不按照链接体现的格式。这个就得调整程序了。
    for iweb in webList:
        print(iweb)
        iFile=requests.get(iweb)
        iFile.encoding="utf-8"
        idata=iFile.text
        iobs=bs(idata,"html.parser")
        iobs.prettify()

        result0=iobs.find_all(attrs={"class":"section-heading"})


##        result1=iobs.find_all("section")
##        print(result1)
        
        result1=iobs.find_all(attrs={"class":"mw-parser-output"})
##        for i in result1:
##            print(i.get_text(),file=writeFile)
##
        if len(result0)!=0:
            result1.pop(0)#如果开头标题有多余信息,则走这个分支处理
            xy=zip(result0,result1)
            for i in xy:
                print(i[0].get_text()[:-2],file=writeFile)#下载《春秋左传正义》的时候用了这个程序
                print(i[1].get_text(),file=writeFile)
                
        else:
            for i in result1:
                print(i.get_text(),file=writeFile)#下载《史记三家注》用了这个程序
                
        time.sleep(0.05+random.randint(0,2))
    writeFile.close()


    endTime=time.time()
    long=(endTime-startTime)/60
    print("总记时:","{0:4.2}".format(long),"分钟。")
test()


爬虫实践从zdic中下载文本

import requests
from bs4 import BeautifulSoup as bs
import time
import random
import re 

def test():
    a=int(input("请输入汉典网页起始页码:"))
    b=int(input("请输入汉典网页终止页码:"))

    myName=input("请输入目标文件名:")

    startTime=time.time()
    HouZhui=".docx"
    resultName=myName+HouZhui




    urlList=[]

    for i in range(a,b+1):
        webUrl="https://gj.zdic.net/archive.php?aid-"+str(i)+".html"
        urlList.append(webUrl)

    zongShu=len(urlList)
    n=1


    writeFile=open(resultName,"w",encoding="utf-8")

    for webUrl in urlList:
        webfile=requests.get(webUrl)
        webfile.encoding="utf-8"
        data=webfile.text
        
        obs=bs(data,"html.parser")
        obs.prettify()
        title=obs.title

        for i in title:
            print("\n",file=writeFile)
            print(i,file=writeFile)
            print("★",file=writeFile)

        result=obs.find_all(attrs={"id":"snr2"})
        art=str(result)
        artlines=art.splitlines()
        article=artlines[0][17:]
        article=article.replace("<br/>","s")
        for i in article:
            
            if i=="s":
                print("\n",file=writeFile)
                print("\t",file=writeFile)
            else:print(i,end="",sep="",file=writeFile)
        print("……",file=writeFile)
        print("\n",file=writeFile)
        time.sleep(0.05+random.randint(0,2))
        percent=float(n/zongShu)
        print(f"第{n}页已完成,共计{zongShu}页,完成度","{0:4.2}".format(percent))
        n+=1
    writeFile.close()
        
    endTime=time.time()
    long=(endTime-startTime)/60
    print("总记时:","{0:4.2}".format(long),"分钟。")

爬虫实践 从ctext 中下载文本

从ctext 中下载文本。可以用到ctext包。

ctext 相关说明如下

https://pypi.org/project/ctext/

下面以《论语》为例,说明如何下载。

代码如下:

from ctext import *
setapikey("your-api-key-goes-here")
setlanguage("zh")

stats = getstats()
status = getstatus()
titles = gettexttitles()
capabilities = getcapabilities()

urn = readlink("https://ctext.org/analects")#以论语为例

passages = gettext("ctp:analects/xue-er")
print(passages)




又有如下程序,亦可以实现功能。

def test():
    '''
    https://ctext.org/wiki.pl?if=gb&chapter=868712
    https://ctext.org/wiki.pl?if=gb&chapter=969206
    webUrl="https://ctext.org/wiki.pl?if=gb&res=970278
    '''
    import requests
    from bs4 import BeautifulSoup as bs

    import time
    import random

    headers={}#建立字典
    user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) Gecko/20100101 Firefox/61.0",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
                    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
                    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
                    ]



    startTime=time.time()
    webUrl=input("请输入书本在Ctext中目录所在网址:")#目录页所在编码,可以获得每章的链接
    ##webUrl="https://ctext.org/wiki.pl?if=gb&res=642006"
    startPage=int(input("请输入目录列表中所求链接的序列数字:"))
    webfile=requests.get(webUrl)
    webfile.encoding="utf-8"
    data=webfile.text

    obs=bs(data,"html.parser")
    obs.prettify()


    result=obs.find_all("a")
    Name=obs.h2
    nameStr=Name.get_text()
    nameList=nameStr.split("[")
    resultName=nameList[0]




    urlList=[]
    for i in result:
        if "wiki.pl?" and "卷" in str(i):
            url=list(i.attrs.values())

            webLink="https://ctext.org/"+url[0]
            urlList.append(webLink)
        elif "wiki.pl?" and "序" in str(i):
            url=list(i.attrs.values())
            webLink="https://ctext.org/"+url[0]
            urlList.append(webLink)


    numList=[str(i) for i in range(0,10)]

    zongShu=len(urlList)


    n=0
    writeFile=open(f"{resultName}_FromCtext.txt","a+",encoding="utf-8")

    start=startPage-1
    for webUrl in urlList[start:]:#列表从0开始
        headers['User-Agent']= random.choice(user_agent_list)

        print(webUrl)
        webfile=requests.get(webUrl,headers=headers)
        webfile.encoding="utf-8"
        data=webfile.text
        
        obs=bs(data,"html.parser")
        obs.prettify()
        title=obs.title

        for i in title:
            print(i,file=writeFile)
            print("★",file=writeFile)

        result=obs.find_all(class_="ctext")
        for i in result:
            myStr=i.get_text()
            for num in numList:
                myStr=myStr.replace(num,"")
            print(myStr,file=writeFile)
        n+=1
            

        time.sleep(3+random.randint(0,3))
        percent=float((n+start)/zongShu)
        print(f"第{n+start}页已完成,共计{zongShu}页,完成度","{0:4.2}".format(percent))

        
        
    endTime=time.time()
    long=(endTime-startTime)/60
    print("总记时:","{0:4.2}".format(long),"分钟。")
    writeFile.close()


三者结合

想要使得三者结合,有如下代码:

'''
https://gj.zdic.net/archive.php?aid-6679.html


'''
def test():
    webChoice=input("汉典:Z;维基:W;哲学电子书:C。请输入选择:")
    webChoice=webChoice.upper()
    if webChoice=="Z":
        import BsFromZdic
        BsFromZdic.test()
    elif webChoice=="W":
        import BsFromWik
        BsFromWik.test()
    elif webChoice=="C":
        import BsFromCtext
        BsFromCtext.test()

爬取图片

贴吧中的动画图片

《虹猫蓝兔七侠传》是一部非常不错的动画片,后续还有漫画版的前传和后传。百度贴吧中,有这样一系列图片,现在想把图片爬下来,合成PDF便于阅读。写如下代码:

'''
<img class="BDE_Image" src="https://imgsa.baidu.com/forum/w%3D580/sign=fbff
fefc1f950a7b75354ecc3ad3625c/4c5fa44bd11373f035f5ca55a60f4bfbf9ed04ca.jpg" pic_ext="jpeg" pic_type="0"
width="560" height="388">

<img class="BDE_Image" src="https://imgsa.baidu.com/forum/w%3D580/sign=efda23249e82d158bb8259b9b00
819d5/acb1c2ef76094b36927cbe27a1cc7cd98c109d2e.jpg"
pic_ext="jpeg" pic_type="0" width="560" height="426">

<img class="image_original_original"
style="z-index: 2; width: 585.176px; height: 450px; top: 0px; left: 75.9122px;"
src="http://imgsrc.baidu.com/forum/pic/item/f82405d7912397dd928f3cce5b82b2b7d1a28726.jpg">

'''



urlList=["http://tieba.baidu.com/p/3175345087",
         "http://tieba.baidu.com/p/3175362317",
         "http://tieba.baidu.com/p/3175373350",
         "http://tieba.baidu.com/p/3175383386",
        "http://tieba.baidu.com/p/3175393635",
        "http://tieba.baidu.com/p/3175402697",]

import urllib.request
import re

zhang=1
for webUrl in urlList:

    i=1
    htmll=urllib.request.urlopen(webUrl).read()

    data=str(htmll)


    pattern=r'''img class="image_original_original".*?src="(.+?\.jpg)"'''#改用原始字符串,并允许class与src之间还有style等其它属性
    result=re.compile(pattern).findall(data)

    for imageUrl in result:
        print(imageUrl)
##        print(imageUrl)
##        imageName=str(zhang)+"-"+str(i)+".jpg"
##        i=i+1
##        urllib.request.urlretrieve(imageUrl,filename=imageName)
##    zhang=zhang+1
##    
    print()

注意:爬取后的图片,有模糊图和高清图两种,名称并不一样。 高清图的位置不在页面中而是在别处,需要修正一下网络链接才能爬取。
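
修正链接的思路大致如下:缩略图和高清图往往只是主机与路径不同、文件名(哈希)相同,可以把缩略图链接里的文件名拼接到高清图地址的前缀后面。下面是一个最小示意,前缀取自上文观察到的 http://imgsrc.baidu.com/forum/pic/item/,具体规律仍以实际页面为准:

#最小示意:把缩略图链接改写成高清图链接(假设两者共用同一个文件名哈希)
thumbUrl="https://imgsa.baidu.com/forum/w%3D580/sign=fbfffefc1f950a7b75354ecc3ad3625c/4c5fa44bd11373f035f5ca55a60f4bfbf9ed04ca.jpg"
hdPrefix="http://imgsrc.baidu.com/forum/pic/item/"

fileName=thumbUrl.split("/")[-1]#文件名(哈希)在两种链接里是一样的
hdUrl=hdPrefix+fileName
print(hdUrl)
#确认链接能正常打开后,再用 urllib.request.urlretrieve(hdUrl,"图1.jpg") 下载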

那么,能否从中找到共性,写成通用的代码呢?于是做了如下尝试:

def test():
    '''
    http://imgsrc.baidu.com/forum/pic/item/e69597510fb30f24ebcb4ec9ca95d143ac4b0347.jpg
    http://imgsrc.baidu.com/forum/pic/item/4c0f7af082025aaf165fdc01f9edab64024f1aa3.jpg

    '''
    import urllib.request
    import requests
    from bs4 import BeautifulSoup as bs
    import re
    print("每个网站的情况并不一致,借鉴此程序后,重新写代码为宜。")
    mychoice=input("是否继续 Y or N:")
    if mychoice=="Y":
        pass
    else:
        exit()
        
    print("如为避免遗漏而需下载网页,请复制网页代码到web.html并输入D。")
    print("如在网上运行,请输入W。")  
    choice=input("Download or Web:")


    webUrlList=[]
    while True:
        webUrl=input("请输入要下载图片所在的完整网站:")
        webUrlList.append(webUrl)
        webChoice=input("是否继续输入网站,Y or N:")
        if webChoice=="N":
            break

    ##webUrl="https://baike.baidu.com/pic/黑小虎传奇/4659511"#点击进入百度黑小虎传奇图册。
    adjust=input("是否需要调整高清图,Y or N:")

    classImage=str(input("请输入obs寻找到的class类别:"))

    pattern='src="..*?"'
    
    zhang=1

    if choice=="D" and adjust=="N":
        myfile=open("web.html","r",encoding="utf-8")
        data=myfile.read()
        myfile.close()
        obs=bs(data,"html.parser")
        result=obs.find_all(attrs={"class":classImage})
        n=1
        for i in result:
            myLink=re.findall(pattern,str(i))
            bLink=str(myLink[0])[5:-1]#去掉匹配结果里的 src=" 前缀和结尾引号,得到纯链接
            print(bLink)
            imageName="图"+str(n)+".jpg"
            urllib.request.urlretrieve(bLink,filename=imageName)
            n+=1
        zhang+=1

    elif choice=="D" and adjust=="Y":

        addLink="watermark,image_d2F0ZXIvYmFpa2UxODA=,g_7,xp_5,yp_5/format,f_auto"
        myfile=open("web.html","r",encoding="utf-8")
        data=myfile.read()
        myfile.close()
        obs=bs(data,"html.parser")
        result=obs.find_all("img")
        n=1
        
        for i in result:
            try:
        ##        print(i)
                myLink=re.findall(pattern,str(i))

                aLink=myLink[0]
                aList=aLink.split("/")
                aLink=aList[2][:-1]#需要依据实际情况不断调整。
        ##        print(aList)
                bLink=f"https://bkimg.cdn.bcebos.com/pic/{aLink}?x-bce-process=image/{addLink}"
        ####        bLink=aList[-1]#通过观察,找到更为清晰的图片链接。
                print(bLink)

                imageName="图"+str(n)+".jpg"
                urllib.request.urlretrieve(bLink,filename=imageName)
                n+=1
            except:pass
        zhang+=1


    elif choice=="W" and adjust=="Y":
        addLink=input("请根据情况输入图片网址的前半部分:")
        #"http://imgsrc.baidu.com/forum/pic/item/"
        for webUrl in webUrlList:
            html=requests.get(webUrl)
            html.encoding="utf-8"
            data=html.text
            obs=bs(data,"html.parser")
            obs.prettify()
            result=obs.find_all(attrs={"class":classImage})
            n=1
            for i in result:
                print(i)
                myLink=re.findall(pattern,str(i))#bs是用find_all,而re使用findall
                print(myLink)
                aLink=myLink[0]
                aList=aLink.split("/")
                bLink=addLink+aList[-1].rstrip('"')#通过观察,找到更为清晰的图片链接;去掉结尾多余的引号
                print(bLink)
                imageName=str(zhang)+"图"+str(n)+".jpg"        
                urllib.request.urlretrieve(bLink,filename=imageName)
                n+=1
            zhang+=1

    elif choice=="W" and adjust=="N":
        zhang=1
        for webUrl in webUrlList:
            html=requests.get(webUrl)
            html.encoding="utf-8"
            data=html.text
            obs=bs(data,"html.parser")
            obs.prettify()
            result=obs.find_all(attrs={"class":classImage})

            n=1
            for i in result:
                myLink=re.findall(pattern,str(i))
                bLink=str(myLink[0])[5:-1]#去掉匹配结果里的 src=" 前缀和结尾引号,得到纯链接
                print(bLink)
                imageName="图"+str(n)+".jpg"
                urllib.request.urlretrieve(bLink,filename=imageName)
                n+=1
            zhang+=1

    else:print("未能导出图片,请进一步完善程序。")


                


            


网站中的地理图片

国家地理网站中,有一些图片也可以进行爬取。


# inputfile='nationalgeograph.html'
# outputfile="nationalgeograph-urls.txt"

def inputFile():
    f=open("nationalgeographic.htm","r",encoding="utf-8")
    ls=f.readlines()
    f.close()
    print(ls)
    #<img alt="火山口" src="http://image.ngchina.com.cn/2019/1104/20191104100458321.jpg">

    urls = []
    for line in ls:
        if 'img' in line:
            url = line.split('src=')[-1].split('"')[1]  # 先按src=切开取最后一段,再按双引号切分,取出引号内的链接
            if 'http' in url:
                urls.append(url)
    #print(urls)
    return urls
#inputFile()


def showResults():
    urls=inputFile()
    f=open("result.txt","w",encoding="utf-8")
    count = 0
    for url in urls:
        print("第{}个URL:{}".format(count, url),file=f)
        print("第{}个URL:{}".format(count, url))
        count += 1
    f.close()

showResults()

注意,爬取时候要获得图片的网页链接
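
拿到图片链接之后,还要把图片真正下载到本地。下面是一个最小示意,沿用上面的 inputFile(),用 urllib.request.urlretrieve 逐个保存(文件名格式只是假设):

import urllib.request

def downloadImages():
    urls=inputFile()#沿用上面定义的inputFile(),从本地html里提取出图片链接
    count=0
    for url in urls:
        imageName="图"+str(count)+".jpg"#文件命名方式只是示意
        try:
            urllib.request.urlretrieve(url,filename=imageName)
            print("已保存:",imageName,url)
        except Exception as e:#个别链接可能失效或被防盗链拦截
            print("下载失败:",url,e)
        count+=1

#downloadImages()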



网站中的人物图片

在学习爬虫的时候,借鉴了网友的代码,如下所示,其中在运行过程中,发现会有bug需要修正。在一步一步运行代码并修正的时候,也对爬虫有了更深入的了解。

import requests

url="http://www.runoob.com"


header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
#设置headers,网站会根据这个判断你的浏览器及操作系统,很多网站没有此信息将拒绝你访问
#用get方法打开url并发送headers
html = requests.get(url,headers = header)
#print(html.text)


#提取所需要的信息
##将获取的源码转换为BeautifulSoup对象
##使用find搜索需要的数据,保存到容器中
from bs4 import BeautifulSoup

url='http://www.mzitu.com'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
html=requests.get(url,headers=header)
#print(html.text)

 
#使用自带的html.parser解析,速度慢但通用
soup = BeautifulSoup(html.text,'html.parser')


#寻找div中的所有a
all_a=soup.find("div",class_="postlist").find_all("a",target="_blank")
##for a in all_a:
##    title=a.get_text()#提取文本
##    print(title)
##
##
##all_div=soup.find("div",class_="postlist")
##for i in all_div:
##    tmp=i.get_text()
##    print(tmp)
#find 返回类型和find_all返回类型不同,find_all才能用get_text()

##page = soup.find_all('a', class_='page-numbers')
##max_page = page[-2].text
###print(max_page)

picture=soup.find("div",class_='postlist').find_all("a",target="_blank")
for everylink in picture:
    #print(everylink)
    
    tmp=everylink.attrs
    #print(tmp)
    
    mytxt=everylink.get_text()
    
    if "href" in everylink.attrs:
        print(f"href={everylink.attrs['href']}",sep="\t")
    
#print(picture)


# same_url = 'http://www.mzitu.com/page/'   # 主页默认最新图片
# 获取每一类MM的网址
##same_url = 'https://www.mzitu.com/mm/page/'
##
## 
##for n in range(1, int(max_page) + 1):
##    ul = same_url + str(n)
##    #print(ul)
##    # 分别对当前类每一页第一层url发起请求
##    start_html = requests.get(ul, headers=header)
##    # 提取所有MM的标题
##    soup = BeautifulSoup(start_html.text, "html.parser")
##    all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
##    #print(all_a)
##    
##    # 遍历所有MM的标题
##    for a in all_a:
##        # 提取标题文本,作为文件夹名称
##        title = a.get_text()
##        print(title)
##        if(title != ''):
##            print("准备扒取:" + title)
##            if(oa.path.exists(path+title.strip()))
##            
## 
## 


    

于是,经过试错,不断修正,完善为如下代码:
 

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
 
all_url = 'https://www.mzitu.com'
 
# http请求头
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
# 此请求头Referer破解盗图链接
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}
 
# 对mzitu主页all_url发起请求,将返回的HTML数据保存,便于解析
start_html = requests.get(all_url, headers=Hostreferer)
 
# Linux保存地址
# path = '/home/Nick/Desktop/mzitu/'
 
# Windows保存地址
path = 'E:/mzitu/'
 
# 获取最大页数
soup = BeautifulSoup(start_html.text, "html.parser")
page = soup.find_all('a', class_='page-numbers')
max_page = page[-2].text
 
 
# same_url = 'http://www.mzitu.com/page/'   # 主页默认最新图片
# 获取每一类MM的网址
same_url = 'https://www.mzitu.com/mm/page/'     # 也可以指定《qingchun MM系列》
 
for n in range(1, int(max_page) + 1):
    # 拼接当前类MM的所有url
    ul = same_url + str(n)
 
    # 分别对当前类每一页第一层url发起请求
    start_html = requests.get(ul, headers=Hostreferer)
 
    # 提取所有MM的标题
    soup = BeautifulSoup(start_html.text, "html.parser")
    all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
 
    # 遍历所有MM的标题
    for a in all_a:
        # 提取标题文本,作为文件夹名称
        title = a.get_text()
        if(title != ''):
            print("准备扒取:" + title)
 
            # windows不能创建带?的目录,添加判断逻辑
            if(os.path.exists(path + title.strip().replace('?', ''))):
                # print('目录已存在')
                flag = 1
            else:
                os.makedirs(path + title.strip().replace('?', ''))
                flag = 0
            # 切换到上一步创建的目录
            os.chdir(path + title.strip().replace('?', ''))
 
            # 提取第一层每一个MM的url,并发起请求
            href = a['href']
            html = requests.get(href, headers=Hostreferer)
            mess = BeautifulSoup(html.text, "html.parser")
 
            # 获取第二层最大页数
            pic_max = mess.find_all('span')
            pic_max = pic_max[9].text
            if(flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max)):
                print('已经保存完毕,跳过')
                continue
 
            # 遍历第二层每张图片的url
            for num in range(1, int(pic_max) + 1):
                # 拼接每张图片的url
                pic = href + '/' + str(num)
 
                # 发起请求
                html = requests.get(pic, headers=Hostreferer)
                mess = BeautifulSoup(html.text, "html.parser")
                pic_url = mess.find('img', alt=title)
                print(pic_url['src'])
                html = requests.get(pic_url['src'], headers=Picreferer)
 
                # 提取图片名字
                file_name = pic_url['src'].split(r'/')[-1]
 
                # 保存图片
                f = open(file_name, 'wb')
                f.write(html.content)
                f.close()
            print('完成')
    print('第', n, '页完成')

个人图书馆中的学习图片

在360doc中,有些图片很利于学习,如何爬取呢?

写如下代码:

import urllib.request
import requests
from bs4 import BeautifulSoup as bs
import re

 
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
    'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
 
    'limit': '20',
    'sort_by': 'created'
}

 
webUrl="http://www.360doc.com/showweb/0/0/1104723360.aspx"

webFile=requests.get(webUrl)
webFile.encoding="utf-8"
data=webFile.text
print(data)


运行后,返回效果如下

<html>
<head><title>403 Forbidden</title></head>
<body bgcolor="white">
<center><h1>403 Forbidden</h1></center>
<hr><center>nginx</center>
</body>
</html>

推测原因,是你需要登录账号,才可以查看。

403 Forbidden是HTTP协议中的一个状态码(Status Code),可以简单地理解为没有权限访问此站点:服务器理解了本次请求,但拒绝执行,该请求也不应原样重发给服务器。

既然这样,直接Ctrl + S,保存该网页所有信息。该网页图片也会全部保存下来,反而更有效率。
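
在手动保存之前,也可以先试着把前面定义好的 headers 传给 requests.get(上面的代码定义了 headers 却没有用上):有些站点只是拒绝没有浏览器标识的请求;若仍返回 403,再按上面的办法登录或手动保存。一个最小示意(其中 referer 是猜测性的补充,未必能绕过限制):

import requests

webUrl="http://www.360doc.com/showweb/0/0/1104723360.aspx"
headers={'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
         'referer':'http://www.360doc.com/'}#referer为猜测性补充

webFile=requests.get(webUrl,headers=headers)
print(webFile.status_code)#若仍是403,说明只靠请求头不够,需要登录或按上面的办法手动保存
if webFile.status_code==200:
    webFile.encoding="utf-8"
    print(webFile.text[:200])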

【心得】

爬取综合信息

网站中的邮箱号码

对于网易中的邮箱号码,也可以进行爬取。



        
def Gupiao():

    htmlfile=requests.get("http://quotes.money.163.com/trade/lsjysj_zhishu_000001.html?year=")
    htmlfile.encoding='utf-8'
    mysoup = BeautifulSoup(htmlfile.text,'html.parser')
    mycontent=mysoup.prettify()
    #print(type(mycontent))

    #输出字符串的前面信息,便于观察整个网站构成
    print(mycontent[:200])
    print()


    #寻找需要的信息,区分不同的语法
    def Find():        
        myinfor=mysoup.find_all("a")
        for i in myinfor:
            tmp=i.get_text()
            print(tmp)
            print(i)
            print()
            print(i.prettify())
        #print(myinfor)

            
    #将需要的网站输出
    def Wangzhan():
        urlsList=[]
        myinfor=mysoup.find_all("a")
        for line in myinfor:
            #print(line)
            
            tmp=str(line)#line的类型是<class 'bs4.element.Tag'>
            
            
            if "http" in tmp:
                url=tmp.split('"')#将长的字符串切分,留下网站
                urlsList.append(url[1])
                
                print(line.get_text())#获得网站的标题
                print(url[1])#输出网站字符串
            
    Wangzhan()
        
            
Gupiao()

     
def Ceyan():

    htmlfile=requests.get("http://quotes.money.163.com/trade/lsjysj_zhishu_000001.html?year=")
    htmlfile.encoding='utf-8'
    mysoup = BeautifulSoup(htmlfile.text,'html.parser')
    mycontent=mysoup.prettify()
    #print(type(mycontent))
    print(mycontent[:500])
    print()
    
    myinfor=mysoup.find_all("a")#<a href="http://www.163.com/">网易首页</a>,寻找的是属性,如a,如td,如tr,head,body,
    for i in myinfor:
        tmp=i.get_text()
        print(tmp)
        print(i)
        print()
        print(i.prettify())#用不同的语法格式,看看输出的效果如何。然后就知道各个语句的用法何在。prettify的作用是把密密麻麻的一行输出为整齐的几行,便于阅读。
    #print(myinfor)

Ceyan()

在实践中,也会遇到问题。可以调整代码解决:

from bs4 import BeautifulSoup
import time

htmlFile=open("stock1.html","r",encoding="utf-8")
htmlContent=htmlFile.read()

#time.sleep(10) #暂停10秒

myBS=BeautifulSoup(htmlContent,"html.parser")
#print(myBS)
myLinks=myBS.find_all("a")
#print(myLinks)

for everyLink in myLinks:
	myText=everyLink.get_text()
	#print(myText)

	if "163.com" not in myText:
		print("test")
		print(myText)
		if "href" in everyLink.attrs:#属性attrs
			print(f"{myText}:href={everyLink.attrs['href']}",sep="\t")
			#print(myText,":href=",everyLink.attrs['href'],sep="\t")


'''
myBs=BeautifulSoup(htmlcontent,''html.parser'')
mylinks=myBs.find_all('a')

for everylink in mylinks:
	mytext=everylink.get_text()
	if '163.com' not in mytext:
		if"href" in everylink.attrs:
			print(mylink)

问题:为什么163.com还会出现呢?
运行结果:
网易首页:href=http://www.163.com/
新闻:href=http://news.163.com/
体育:href=http://sports.163.com/
NBA:href=http://sports.163.com/nba/
娱乐:href=http://ent.163.com/
财经:href=http://money.163.com/
股票:href=http://money.163.com/stock/
汽车:href=http://auto.163.com/
科技:href=http://tech.163.com/
'''

163.com 之所以仍然出现,是因为判断针对的是 get_text() 取到的链接文字(如“网易首页”),而 163.com 只出现在 href 属性里,所以条件 "163.com" not in myText 几乎总是成立;若要过滤,应改为判断 everyLink.attrs['href']。爬取之后,再把结果有序输出。
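
若要“有序输出”,可以先把(链接文字, href)收集到列表里,排序后再统一打印。一个最小示意(沿用上面代码里的 myLinks):

#最小示意:收集后排序输出(myLinks 来自上面 find_all("a") 的结果)
pairs=[]
for everyLink in myLinks:
    myText=everyLink.get_text()
    if "href" in everyLink.attrs:
        pairs.append((myText,everyLink.attrs['href']))

pairs.sort()#默认按链接文字排序,也可用 key=lambda x:x[1] 改为按链接排序
for text,href in pairs:
    print(f"{text}:href={href}")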

网站中的大学信息

读取网络文本


import requests
webFile=requests.get("http://www.pku.edu.cn")
webFile.encoding="utf-8"
webFile=webFile.text
print(webFile)

解析网页

import requests
response=requests.get('https://www.pku.edu.cn')
mycode=response.status_code
mycontent=response.content

分析所爬内容


with open(r"E:\pkuCode.txt","r",encoding="utf-8") as myFile:
    data=myFile.readlines()

myList=list(data)
for i in myList:
    print(i)
    input()

解析对象

import requests
from bs4 import BeautifulSoup as bs


webFile=requests.get("https://www.pku.edu.cn")#爬虫获得html文件
webFile.encoding="utf-8"#爬虫解析网页文件
data=webFile.text#用text文档形式展现,解析为字符串

soup=bs(data,"html.parser")# 把网页解析为BeautifulSoup对象
soup.prettify()

items2=soup.find_all(class_="item")


myFile=open(r"E:\mySchoolLink.txt","w",encoding="utf-8")

for everyTag in items2:
    #print(everyTag)

    print(file=myFile)
    print("文字部分",file=myFile)
    myText=everyTag.get_text()
    print(myText,file=myFile)

    print(file=myFile)
    print("链接部分",file=myFile)
    myLinks=everyTag.find_all("a")#everyLink是BS 中的tag
    for everyLink in myLinks:
        if "href" in everyLink.attrs:#attrs只有在BS 中tag中才可以用。
            print(everyLink.attrs,file=myFile)

myFile.close()    

可以用requests ,将对象存储下来:

import requests
file1=requests.get("https://www.pku.edu.cn")

file1.encoding="utf-8"
data=file1.text

myFile=open(r"E:\pkuCode.txt","w",encoding="utf-8")
print(data,file=myFile)

myFile.close()


'''
soup的数据类型是<class 'bs4.BeautifulSoup'>,说明soup是一个BeautifulSoup对象
打印的soup,是所请求网页的完整HTML源代码
虽然response.text和soup打印出的内容表面上看长得一模一样,却有着不同的内心,它们属于不同的类:<class 'str'> 与<class 'bs4.BeautifulSoup'>。前者是字符串,后者是已经被解析过的BeautifulSoup对象。之所以打印出来的是一样的文本,是因为BeautifulSoup对象在直接打印它的时候会调用该对象内的str方法,所以直接打印 bs 对象显示字符串是str的返回结果
'''
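
上面这段说明可以用几行代码直观验证:response.text 是字符串,soup 是 BeautifulSoup 对象,直接打印时看起来一样:

import requests
from bs4 import BeautifulSoup

response=requests.get("https://www.pku.edu.cn")
response.encoding="utf-8"
soup=BeautifulSoup(response.text,"html.parser")

print(type(response.text))#<class 'str'>
print(type(soup))#<class 'bs4.BeautifulSoup'>
print(response.text[:60])
print(str(soup)[:60])#直接打印soup时输出的就是它的字符串形式,所以两者看起来一样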

网站中的音乐

import requests
from bs4 import BeautifulSoup as bs

headers = {
    'origin':'https://y.qq.com',
    # 请求来源,本案例中其实是不需要加这个参数的,只是为了演示
    'referer':'https://y.qq.com/n/yqq/song/004Z8Ihr0JIu5s.html',
    # 请求来源,携带的信息比“origin”更丰富,本案例中其实是不需要加这个参数的,只是为了演示
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    # 标记了请求从什么设备,什么浏览器上发出
    }
# 伪装请求头


url1='''
https://c.y.qq.com/soso/fcgi-bin/client_search_cp?
ct=24&qqmusic_ver=1298&
new_json=1&remoteplace=txt.yqq.song&
searchid=57068364391640558&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&
'''
url2="p=1"

url3="""
&n=10&w=%E5%91%A8%E6%9D%B0%E4%BC%A6&g_tk_new_20200303=5381&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0
"""
#注意,这个网址是在网页检查过程中找到并复制的,针对文本所在内容的网址,而不是qq音乐的官网。详情参看风变编程笔记。

for i in range(1,10):
    url=url1+"p="+str(i)+url3
    webFile=requests.get(url)
    webFile.encoding="utf-8"
    data=webFile.text

    jsonMusic=webFile.json()
    listMusic = jsonMusic['data']['song']['list']
    
    for i in listMusic:
        print("专辑名:",i["albumname"])
        print("歌曲名:",i["songname"])
        print('播放时长:'+str(i['interval'])+'秒')
        print('播放链接:https://y.qq.com/n/yqq/song/'+i['media_mid']+'.html\n\n')
  

import requests
from bs4 import BeautifulSoup as bs

myHref="https://y.qq.com/n/yqq/singer/0025NhlN2yWrP4.html"
webFile=requests.get(myHref)
data=webFile.text

soup=bs(data,"html.parser")




print("""class_=js_song""")
items1=soup.find_all(class_="js_song")
count=0
for everyLink in items1:
    myText=everyLink.get_text()
    print("everyLink : ","\n",everyLink)
    
    print("myText:","\n",myText)
    print("everyLink.attrs:","\n",everyLink.attrs)
    print(everyLink.attrs["href"])

    count+=1
    if count==1:
        break

print()

print("""class_=songlist__songname_txt""")
items2=soup.find_all(class_="songlist__songname_txt")

count=0
for everyLink in items2:
    myText=everyLink.get_text()
    print("everyLink : ","\n",everyLink)
    
    print("myText:","\n",myText)
    print("everyLink.attrs:","\n",everyLink.attrs)
    print(everyLink.attrs["class"])
    count+=1
    if count==1:
        break
'''    
    if "href" in everyLink.attrs:#属性attrs
            print(f"{myText}:href={everyLink.attrs['href']}",sep="\t")
            print(myText,":href=",everyLink.attrs['href'],sep="\t")


    注意,bs提取的信息,class很关键。筛选的东西,之后会形成一个字典。
    如果筛选的范围是链接范围,everyLink.attrs["href"]就会出现链接。
    如果筛选的范围是文本标签,就只能写成everyLink.attrs["class"]
'''
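
上面这段注释可以用一个纯演示的小例子直观验证(标签内容是随手编的):只有链接标签的 attrs 里才有 href,文本类标签的 attrs 里通常只有 class:

from bs4 import BeautifulSoup as bs

demo=bs('<a class="js_song" href="/n/yqq/song/demo.html">某首歌</a>'
        '<span class="songlist__songname_txt">某首歌</span>',"html.parser")

for tag in demo.find_all():
    print(tag.name,tag.attrs)#a标签的attrs里有href,span标签的attrs里只有class
    if "href" in tag.attrs:
        print("链接:",tag.attrs["href"])
    else:
        print("类别:",tag.attrs["class"])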


import requests
from bs4 import BeautifulSoup as bs

webURL="""
https://c.y.qq.com/soso/fcgi-bin/client_search_cp?
ct=24&qqmusic_ver=1298&
new_json=1&remoteplace=txt.yqq.song&
searchid=57068364391640558&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&
p=1
&n=10&w=%E5%91%A8%E6%9D%B0%E4%BC%A6&g_tk_new_20200303=5381&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0
"""

webFile=requests.get(webURL)
webFile.encoding="utf-8"
data=webFile.text

jsonFile=webFile.json()
##print(type(jsonFile))#<class 'dict'>使用json()方法,将对象转为列表/字典
##for (k,v) in jsonFile.items():
##    print(k)

musicData=jsonFile["data"]#注意,文中是引号-字符串,那么得用引号,如果写成jsonFile[data]是没有用的
##print(type(musicData))
##for (k,v) in musicData.items():
##    print(k)


listMusic=musicData["song"]["list"]
print(type(listMusic))
for music in listMusic:
    print("播放专辑:",music["album"]["name"])
    print('播放时长:'+str(music['interval'])+'秒')  # 查找播放时长
    print('播放链接:https://y.qq.com/n/yqq/song/' +music['mid']+'.html\n\n')
    input()
##
##soup=bs(data,"html.parser")
##print(type(soup))#<class 'bs4.BeautifulSoup'>

import requests
from bs4 import BeautifulSoup as bs
import openpyxl

workBook=openpyxl.Workbook()
sheet1=workBook.active
sheet1.title="qq音乐链接表"



url="https://y.qq.com/n/yqq/singer/000FzZ3q3kxTMG.html"

webFile=requests.get(url)
webFile.encoding="utf-8"
data=webFile.text

soup=bs(data,"html.parser")


sheet1.append(["footer_platform_list__item"])
Tag1=soup.find_all(class_="footer_platform_list__item")
for everyTag in Tag1:
    myText=everyTag.get_text()


    myLinks=everyTag.find_all("a")
    for i in myLinks:
        if "href" in i.attrs:
            myList1=[myText,i['href']]

            print(myList1)
            
            sheet1.append(myList1)


sheet1.append(["footer_link"])
Tag2=soup.find_all(class_="footer_link")
for everyTag in Tag2:
    myText=everyTag.get_text()
    myLinks=everyTag.find_all("a")
    for i in myLinks:
        if "href" in i.attrs:
            myList2=[myText,i["href"]]
            print(myList2)
            sheet1.append(myList2)
            
workBook.save("积累文档-QQ音乐网络链接.xlsx")



import requests
from bs4 import BeautifulSoup as bs
import openpyxl

workBook=openpyxl.Workbook()
sheet1=workBook.active
sheet1.title="qq音乐链接表"



url="https://y.qq.com/n/yqq/singer/000FzZ3q3kxTMG.html"

webFile=requests.get(url)
webFile.encoding="utf-8"
data=webFile.text

soup=bs(data,"html.parser")


myClass=['footer_platform_list__item','footer_link','footer_download','footer_copyright','footer_platform']#同一个class写一次即可,避免重复导出

for everyClass in myClass:
    print(everyClass)
    
    sheet1.append([everyClass])
    Tag1=soup.find_all(class_=everyClass)

    for everyTag in Tag1:
        myText=everyTag.get_text()
        myLinks=everyTag.find_all("a")

        for i in myLinks:
            if "href" in i.attrs:
                myList1=[myText,i["href"]]
                print(myList1)

                sheet1.append(myList1)

workBook.save("积累文档-QQ音乐链接简练版.xlsx")


        

网站中的题目


#爬取网站上的题目
from bs4 import BeautifulSoup
import time
import requests


def Pachong():
    for pageNum in range(1,17):
        htmlFile=requests.get('http://vers.cqvip.com/view/course/subject/list.aspx?stid=ad29646905394d96a50c1818329fb4f6&cid=120&searchkey='+str(pageNum))
        htmlFile.encoding='utf-8'

        
        soup = BeautifulSoup(htmlFile.text,'html.parser')
        print(soup)
        input()
    #Pachong()


    htmlFile=requests.get('http://vers.cqvip.com/view/course/subject/list.aspx?stid=ad29646905394d96a50c1818329fb4f6&cid=120&searchkey=2')
    htmlFile.encoding='utf-8'
    print(htmlFile)


def PaTi():
    htmlfile=requests.get("http://vers.cqvip.com/view/course/chapter/detail.aspx?cid=125&chapter=%E6%98%8E%E4%BB%A3%E6%96%87%E5%AD%A6")
    htmlfile.encoding='utf-8'
    mysoup=BeautifulSoup(htmlfile.text,'html.parser')

    mycontent1=mysoup.prettify()
    print(mycontent1[:100])

    
    mycontent2=mysoup.smooth()#注意:smooth()是在原地合并相邻的文本节点,返回值是None
    print(mycontent2)
    print("OK")

    mycontent3=mysoup.select_one("div")
    print(mycontent3)
    print("Next")
    print()

    
    myinfor=mysoup.find("div").find_all("strong")
    print(myinfor)
    tmp=mysoup.find_next_sibling("div")
    print(tmp)
    
    
#class="q-box"
PaTi()

        
def Gupiao():

    htmlfile=requests.get("http://quotes.money.163.com/trade/lsjysj_zhishu_000001.html?year=")
    htmlfile.encoding='utf-8'
    mysoup = BeautifulSoup(htmlfile.text,'html.parser')
    mycontent=mysoup.prettify()
    #print(type(mycontent))

    #输出字符串的前面信息,便于观察整个网站构成
    print(mycontent[:200])
    print()


    #寻找需要的信息,区分不同的语法
    def Find():        
        myinfor=mysoup.find_all("a")
        for i in myinfor:
            tmp=i.get_text()
            print(tmp)
            print(i)
            print()
            print(i.prettify())
        #print(myinfor)

            
    #将需要的网站输出
    def Wangzhan():
        urlsList=[]
        myinfor=mysoup.find_all("a")
        for line in myinfor:
            #print(line)
            
            tmp=str(line)#line的类型是<class 'bs4.element.Tag'>
            
            
            if "http" in tmp:
                url=tmp.split('"')#将长的字符串切分,留下网站
                urlsList.append(url[1])
                
                print(line.get_text())#获得网站的标题
                print(url[1])#输出网站字符串
            
    Wangzhan()
        
            
#Gupiao()






网站中的博客内容

知乎中的文章

import requests
import csv
#引用csv。
csv_file=open('articles.csv','w',newline='',encoding='utf-8')
#调用open()函数打开csv文件,传入参数:文件名“articles.csv”、写入模式“w”、newline=''。
writer = csv.writer(csv_file)
# 用csv.writer()函数创建一个writer对象。
list2=['标题','链接','摘要']
#创建一个列表
writer.writerow(list2)
#调用writer对象的writerow()方法,可以在csv文件里写入一行文字 “标题”和“链接”和"摘要"。

headers={'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
url='https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles?'
offset=0
#设置offset的起始值为0
while True:
    params={
        'include':'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
        'offset':str(offset),
        'limit':'20',
        'sort_by':'voteups',
        }
    #封装参数
    res=requests.get(url,headers=headers,params=params)
    #发送请求,并把响应内容赋值到变量res里面
    articles=res.json()
    print(articles)
    data=articles['data']
    #定位数据
    for i in data:
        list1=[i['title'],i['url'],i['excerpt']]
        #把目标数据封装成一个列表
        writer.writerow(list1)
        #调用writerow()方法,把列表list1的内容写入
    offset=offset+20
    #在while循环内部,offset的值每次增加20
    if offset > 40:
        break
csv_file.close()
#写入完成后,关闭文件就大功告成
print('okay')   

import requests
from bs4 import BeautifulSoup as bs

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
    'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
    'limit': '20',
    'sort_by': 'created'
}

url2="https://www.zhihu.com/org/jing-ji-ri-bao-xin-wen-ke-hu-duan/posts"
webFile=requests.get(url2,params=params,headers=headers)
webFile.encoding="utf-8"
data=webFile.text

soup=bs(data,"html.parser")
preData=soup.prettify()



items2=soup.find_all(class_="item")

for iTag in items2:
    for i in iTag.find_all():
        print(i)

爬取博客

from urllib3 import *
from re import *
http=PoolManager()

#禁止显示警告信息
disable_warnings()

#下载url对应web页面
url="https://www.cnblogs.com/"
result=http.request("GET",url)
htmlStr=result.data.decode("utf-8")
print(htmlStr)


#分析html代码
#通过正则表达式,获取所有关于目标的信息
#<a class="post-item-title" href="https://www.cnblogs.com/hzoi-fengwu/p/14922218.html" target="_blank">STL----vector注意事项</a>

aList=findall(r'<a[^>]*post-item-title[^>]*>[^<]*</a>',htmlStr)
result=[]

#提取每一个<a后面的url

for a in aList:
    #利用正则表达式提取href后面的url
    g=search(r'href\s*=\s*"([^"]*)"',a)#用分组捕获引号内的链接
    if g!=None:
        url=g.group(1)

        #得到url
        print(url)

爬取博客标题-爬虫-正则表达式部分
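
同样的需求也可以不用正则,改用前文一直在用的 BeautifulSoup 按 class 提取。下面是一个最小示意,class 名 post-item-title 来自上面注释里的示例标签:

import requests
from bs4 import BeautifulSoup as bs

webFile=requests.get("https://www.cnblogs.com/")
webFile.encoding="utf-8"
soup=bs(webFile.text,"html.parser")

for a in soup.find_all("a",class_="post-item-title"):
    print(a.get_text(),a.attrs.get("href"))#标题与对应链接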

网站中的词典

#网络爬虫进阶urllib.request
def ilovefish():
    import urllib.request
    myResponse=urllib.request.urlopen("https://ilovefishc.com/")#打开网页,获取信息

    myHtml=myResponse.read()#读出数据
    #print(myHtml)
    myHtml=myHtml.decode("utf-8")#将二进制解码,按照网页信息<head> <meta charset="UTF-8">选择解码格式utf-8
    #print(myHtml)


def placekitten():
    #placekitten.com
    import urllib.request
    myResponse=urllib.request.urlopen("http://placekitten.com/500/600")#打开网页,获取信息

    my_cat_img=myResponse.read()#读出数据
    with open('cat_500_600.jpg','wb') as f:
        f.write(my_cat_img)

def myrequest():
    #urllib.request():This function always returns an object which can work as a context manager and has methods such as
    #geturl() — return the URL of the resource retrieved, commonly used to determine if a redirect was followed
    #info() — return the meta-information of the page, such as headers, in the form of an email.message_from_string() instance (see Quick Reference to HTTP Headers)
    #getcode() – return the HTTP status code of the response.


    import urllib.request
    myresponse=urllib.request.urlopen("http://placekitten.com/300/500")
    myurl=myresponse.geturl()
    print(myurl)

    print(myresponse.info())
    print(myresponse.getcode())

def Cidan():
    #小甲鱼将有道辞典功能提取出的程序,这里还只是个雏形
    import urllib.request
    url='http://fanyi.youdao.com/'
    data={}#待填写的表单字段,需在浏览器开发者工具中抓包确认

    #注意:urlopen的data参数必须是字节串,直接传入字典会报错;提交表单的完整写法见下面的示例
    my_response=urllib.request.urlopen(url)
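
用 urllib.request 向接口提交表单的一般写法大致如下。这只是一个示意:接口地址和表单字段名('i'、'doctype')都是假设,需要按浏览器开发者工具里抓到的真实请求来替换:

import urllib.request
import urllib.parse

def fanyi_demo(word):
    #仅为示意:url和表单字段均为假设值,以实际抓包结果为准
    url='http://fanyi.youdao.com/translate'
    data={'i':word,'doctype':'json'}

    postData=urllib.parse.urlencode(data).encode('utf-8')#urlopen的data参数必须是字节串
    response=urllib.request.urlopen(url,postData)
    result=response.read().decode('utf-8')
    print(result[:200])#若返回的是json,可再用json模块解析

#fanyi_demo("fish")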

          

【心得】
