Python web crawling

Basic crawler operations

requests: checking whether a request succeeded

Collected snippet: crawler, requests, checking whether a request succeeded.

import requests
response=requests.get("https://www.pku.edu.cn")
print(response.status_code)#check whether the request got a proper response; a status code of 200 means the request succeeded.
#1XX: informational (request received). 2XX: success. 3XX: redirection (e.g. 305 Use Proxy). 4XX: client error (e.g. 403 Forbidden). 5XX: server error (e.g. 503 Service Unavailable).
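#A minimal sketch of acting on the status code (same URL as above): response.ok is True for any 2XX code, and raise_for_status() raises requests.HTTPError for 4XX/5XX responses.
response=requests.get("https://www.pku.edu.cn",timeout=10)#timeout is optional but avoids hanging forever
if response.ok:
    print("request succeeded:",response.status_code)
else:
    response.raise_for_status()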

 Writing the downloaded file to disk

import requests


webFile=requests.get("https://www.pku.edu.cn/about.html")#fetch the HTML page
webFile.encoding="utf-8"#set the encoding so the text decodes correctly
data=webFile.text#the page body as a plain string
print(data)

with open(r"E:/myDownload.html","w",encoding="utf-8") as file1:#write the downloaded page to a local file
    file1.write(data)


#[Example] Download several consecutive Wikisource chapters in one loop.
import requests

urlList=[]
a=1#starting chapter number; adjust as needed
for i in range(a,a+3):#while debugging, use range(a,a+1) to fetch only one chapter
    webUrl="https://zh.m.wikisource.org/wiki/春秋左傳正義/卷"+str(i)
    urlList.append(webUrl)

    webFile=requests.get(webUrl)
    webFile.encoding="utf-8"

    data=webFile.text

    myDfile="myDownload"+str(i)+".html"

#Option 1: a with-block
    with open(myDfile,"w",encoding="utf-8") as file1:#write the downloaded page to a local file
        file1.write(data)
#Option 2: open/write/close explicitly (note: this uses write, not print, and rewrites the same file)
    wFile=open(myDfile,"w",encoding="utf-8")
    wFile.write(data)
    wFile.close()

Observing the structure of a website

While scraping, you need to examine the structure of the site. The snippets below explore BeautifulSoup's object types on small HTML fragments.

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
</body>
</html>
"""
 
 
from bs4 import BeautifulSoup

 
soup=BeautifulSoup(html,'html.parser')

#print(soup)
#print(type(soup))#<class 'bs4.BeautifulSoup'>

tag=soup.find('p')
#print(tag)#Tag
string1=tag.string
#print(string1)#NavigableString
 

soup2=BeautifulSoup("<b><!--Hey--></b>",'html.parser')
comment=soup2.b.string
#print(comment)
#print(type(comment))


soup3=BeautifulSoup('<ad id=123 class="red blue">Hey</ad>','html.parser')
tag=soup3.ad
##print(tag.name)#'ad' is a made-up tag name for this example
##print(tag.attrs)

##A Tag object corresponds to a tag in the HTML document.
##
##For a tag, the two most important things are its name and its attributes (attrs).


#modifying the parse tree
soup=BeautifulSoup('<p id=123 class="red blue">Hey</p>','html.parser')
tag=soup.p
tag.name='a'
tag.attrs['id']=456
tag.attrs['class'][0]='white'
#print(soup)


from bs4 import BeautifulSoup
 
soup=BeautifulSoup('<p>Hey</p>','html.parser')
tag=soup.p
##print(tag)
##string=tag.string
##print(string)
##print(type(string))
##
##print(string.split('e'))
##
##print(string.lower())


#A NavigableString can be modified directly, or replaced using the replace_with() method.
from bs4 import BeautifulSoup
 
soup=BeautifulSoup('<p>Hey</p>','html.parser')
tag=soup.p
a='Hello'
tag.string=a
##print(soup)
##tag.string.replace_with('KO')
##print(soup)

html = """
<div>Total
    <p class="story"> First_p
        <a id="1">El</a>,
        <a id="2">E2</a>,
        <a id="3">E3</a>,
    </p>
    <p>Second_p</p>
</div>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html,'html.parser')
#print(soup)
tag=soup.p
#print(tag)
#When several tags share a name, accessing by attribute (soup.p) always returns the first such tag; use find_all to get the others.
##
##print(len(tag.contents))
##print(tag.contents)


html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<div>
<p class="a title"><b>The Dormouse's story</b></p>
<p class="a story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
</div>
<div>
<p class="st">Last<p class="st">......</p></p>
</div>
"""
 
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

#The name filter is simply an HTML tag name, e.g. find all <a> tags in the document.
#print(soup.find_all('a'))


#A regular expression is also accepted as a filter, e.g. all tags whose name contains 'a'.
print()
import re
#print(soup.find_all(re.compile('a')))
##
##
##Every element of a list is used as a filter criterion, e.g. search for all <a> and <b> tags.
#print(soup.find_all(['a','b']))
#print()



#print(soup.find_all('p')[1].find_all(True))
##Output:
##[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
## <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
## <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


def id_filter(tag):#a plain function can also be used as a filter
    return tag.has_attr('id') and tag['id']=='link2'

#print(soup.find_all('p')[1])

##print(soup.find_all('p')[1].find_all(id_filter))
#Output:
#[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

print(soup.select('.st'))

#BeautifulSoup also provides a prettify() method that re-renders incomplete or untidy HTML as an indented, readable document.
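#A minimal sketch of prettify() (a fresh fragment is parsed here rather than reusing the soups above):
messy=BeautifulSoup("<html><body><p>One</p><p>Two</p></body></html>",'html.parser')
print(messy.prettify())#re-rendered with one tag per line, indented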



Changing the saved file's extension (.htm or .txt) changes how the file is opened, which is surprisingly handy.
How do you create an .htm file? Simply save text under that extension; the resulting page can then be opened in a browser.

Request headers (browser identification)

Sometimes a request needs browser-identification headers (a User-Agent) and query parameters:

import requests
import csv


headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
    'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',

    'limit': '20',
    'sort_by': 'created'
}
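
These dicts are passed to requests.get through the headers and params keyword arguments. A minimal sketch (the Zhihu URL is simply the endpoint these notes use later; any URL works):

webFile=requests.get("https://www.zhihu.com/follow",headers=headers,params=params)
print(webFile.status_code)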

Using Beautiful Soup

The output of help(BeautifulSoup) is reproduced below for reference:

Help on class BeautifulSoup in module bs4:

class BeautifulSoup(bs4.element.Tag)
 |  BeautifulSoup(markup='', features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs)
 |
 |  A data structure representing a parsed HTML or XML document.
 |
 |  Most of the methods you'll call on a BeautifulSoup object are inherited from
 |  PageElement or Tag.
 |
 |  Internally, this class defines the basic interface called by the
 |  tree builders when converting an HTML/XML document into a data
 |  structure. The interface abstracts away the differences between
 |  parsers. To write a new tree builder, you'll need to understand
 |  these methods as a whole.
 |
 |  These methods will be called by the BeautifulSoup constructor:
 |    * reset()
 |    * feed(markup)
 |
 |  The tree builder may call these methods from its feed() implementation:
 |    * handle_starttag(name, attrs) # See note about return value
 |    * handle_endtag(name)
 |    * handle_data(data) # Appends to the current data node
 |    * endData(containerClass) # Ends the current data node
 |
 |  No matter how complicated the underlying parser is, you should be
 |  able to build a tree using 'start tag' events, 'end tag' events,
 |  'data' events, and "done with data" events.
 |
 |  If you encounter an empty-element tag (aka a self-closing tag,
 |  like HTML's <br> tag), call handle_starttag and then
 |  handle_endtag.
 |
 |  Method resolution order:
 |      BeautifulSoup
 |      bs4.element.Tag
 |      bs4.element.PageElement
 |      builtins.object
 |
 |  Methods defined here:
 |
 |  __copy__(self)
 |      Copy a BeautifulSoup object by converting the document to a string and parsing it again.
 |
 |  __getstate__(self)
 |
 |  __init__(self, markup='', features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs)
 |      Constructor.
 |
 |      :param markup: A string or a file-like object representing
 |       markup to be parsed.
 |
 |      :param features: Desirable features of the parser to be
 |       used. This may be the name of a specific parser ("lxml",
 |       "lxml-xml", "html.parser", or "html5lib") or it may be the
 |       type of markup to be used ("html", "html5", "xml"). It's
 |       recommended that you name a specific parser, so that
 |       Beautiful Soup gives you the same results across platforms
 |       and virtual environments.
 |
 |      :param builder: A TreeBuilder subclass to instantiate (or
 |       instance to use) instead of looking one up based on
 |       `features`. You only need to use this if you've implemented a
 |       custom TreeBuilder.
 |
 |      :param parse_only: A SoupStrainer. Only parts of the document
 |       matching the SoupStrainer will be considered. This is useful
 |       when parsing part of a document that would otherwise be too
 |       large to fit into memory.
 |
 |      :param from_encoding: A string indicating the encoding of the
 |       document to be parsed. Pass this in if Beautiful Soup is
 |       guessing wrongly about the document's encoding.
 |
 |      :param exclude_encodings: A list of strings indicating
 |       encodings known to be wrong. Pass this in if you don't know
 |       the document's encoding but you know Beautiful Soup's guess is
 |       wrong.
 |
 |      :param element_classes: A dictionary mapping BeautifulSoup
 |       classes like Tag and NavigableString, to other classes you'd
 |       like to be instantiated instead as the parse tree is
 |       built. This is useful for subclassing Tag or NavigableString
 |       to modify default behavior.
 |
 |      :param kwargs: For backwards compatibility purposes, the
 |       constructor accepts certain keyword arguments used in
 |       Beautiful Soup 3. None of these arguments do anything in
 |       Beautiful Soup 4; they will result in a warning and then be
 |       ignored.
 |
 |       Apart from this, any keyword arguments passed into the
 |       BeautifulSoup constructor are propagated to the TreeBuilder
 |       constructor. This makes it possible to configure a
 |       TreeBuilder by passing in arguments, not just by saying which
 |       one to use.
 |
 |  decode(self, pretty_print=False, eventual_encoding='utf-8', formatter='minimal')
 |      Returns a string or Unicode representation of the parse tree
 |          as an HTML or XML document.
 |
 |      :param pretty_print: If this is True, indentation will be used to
 |          make the document more readable.
 |      :param eventual_encoding: The encoding of the final document.
 |          If this is None, the document will be a Unicode string.
 |
 |  endData(self, containerClass=None)
 |      Method called by the TreeBuilder when the end of a data segment
 |      occurs.
 |
 |  handle_data(self, data)
 |      Called by the tree builder when a chunk of textual data is encountered.
 |
 |  handle_endtag(self, name, nsprefix=None)
 |      Called by the tree builder when an ending tag is encountered.
 |
 |      :param name: Name of the tag.
 |      :param nsprefix: Namespace prefix for the tag.
 |
 |  handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, sourcepos=None)
 |      Called by the tree builder when a new tag is encountered.
 |
 |      :param name: Name of the tag.
 |      :param nsprefix: Namespace prefix for the tag.
 |      :param attrs: A dictionary of attribute values.
 |      :param sourceline: The line number where this tag was found in its
 |          source document.
 |      :param sourcepos: The character position within `sourceline` where this
 |          tag was found.
 |
 |      If this method returns None, the tag was rejected by an active
 |      SoupStrainer. You should proceed as if the tag had not occurred
 |      in the document. For instance, if this was a self-closing tag,
 |      don't call handle_endtag.
 |
 |  insert_after(self, successor)
 |      This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
 |      it because there is nothing before or after it in the parse tree.
 |
 |  insert_before(self, successor)
 |      This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
 |      it because there is nothing before or after it in the parse tree.
 |
 |  new_string(self, s, subclass=None)
 |      Create a new NavigableString associated with this BeautifulSoup
 |      object.
 |
 |  new_tag(self, name, namespace=None, nsprefix=None, attrs={}, sourceline=None, sourcepos=None, **kwattrs)
 |      Create a new Tag associated with this BeautifulSoup object.
 |
 |  object_was_parsed(self, o, parent=None, most_recent_element=None)
 |      Method called by the TreeBuilder to integrate an object into the parse tree.
 |
 |  popTag(self)
 |      Internal method called by _popToTag when a tag is closed.
 |
 |  pushTag(self, tag)
 |      Internal method called by handle_starttag when a tag is opened.
 |
 |  reset(self)
 |      Reset this object to a state as though it had never parsed any
 |      markup.
 |
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |
 |  ASCII_SPACES = ' \n\t\x0c\r'
 |
 |  DEFAULT_BUILDER_FEATURES = ['html', 'fast']
 |
 |  NO_PARSER_SPECIFIED_WARNING = 'No parser was explicitly specified, so ...
 |
 |  ROOT_TAG_NAME = '[document]'
 |
 |  ----------------------------------------------------------------------
 |  Methods inherited from bs4.element.Tag:
 |
 |  __bool__(self)
 |      A tag is non-None even if it has no contents.
 |
 |  __call__(self, *args, **kwargs)
 |      Calling a Tag like a function is the same as calling its
 |      find_all() method. Eg. tag('a') returns a list of all the A tags
 |      found within this tag.
 |
 |  __contains__(self, x)
 |
 |  __delitem__(self, key)
 |      Deleting tag[key] deletes all 'key' attributes for the tag.
 |
 |  __eq__(self, other)
 |      Returns true iff this Tag has the same name, the same attributes,
 |      and the same contents (recursively) as `other`.
 |
 |  __getattr__(self, tag)
 |      Calling tag.subtag is the same as calling tag.find(name="subtag")
 |
 |  __getitem__(self, key)
 |      tag[key] returns the value of the 'key' attribute for the Tag,
 |      and throws an exception if it's not there.
 |
 |  __hash__(self)
 |      Return hash(self).
 |
 |  __iter__(self)
 |      Iterating over a Tag iterates over its contents.
 |
 |  __len__(self)
 |      The length of a Tag is the length of its list of contents.
 |
 |  __ne__(self, other)
 |      Returns true iff this Tag is not identical to `other`,
 |      as defined in __eq__.
 |
 |  __repr__ = __unicode__(self)
 |
 |  __setitem__(self, key, value)
 |      Setting tag[key] sets the value of the 'key' attribute for the
 |      tag.
 |
 |  __str__ = __unicode__(self)
 |
 |  __unicode__(self)
 |      Renders this PageElement as a Unicode string.
 |
 |  childGenerator(self)
 |      Deprecated generator.
 |
 |  clear(self, decompose=False)
 |      Wipe out all children of this PageElement by calling extract()
 |         on them.
 |
 |      :param decompose: If this is True, decompose() (a more
 |          destructive method) will be called instead of extract().
 |
 |  decode_contents(self, indent_level=None, eventual_encoding='utf-8', formatter='minimal')
 |      Renders the contents of this tag as a Unicode string.
 |
 |      :param indent_level: Each line of the rendering will be
 |         indented this many spaces. Used internally in
 |         recursive calls while pretty-printing.
 |
 |      :param eventual_encoding: The tag is destined to be
 |         encoded into this encoding. decode_contents() is _not_
 |         responsible for performing that encoding. This information
 |         is passed in so that it can be substituted in if the
 |         document contains a <META> tag that mentions the document's
 |         encoding.
 |
 |      :param formatter: A Formatter object, or a string naming one of
 |          the standard Formatters.
 |
 |  decompose(self)
 |      Recursively destroys this PageElement and its children.
 |
 |      This element will be removed from the tree and wiped out; so
 |      will everything beneath it.
 |
 |  encode(self, encoding='utf-8', indent_level=None, formatter='minimal', errors='xmlcharrefreplace')
 |      Render a bytestring representation of this PageElement and its
 |      contents.
 |
 |      :param encoding: The destination encoding.
 |      :param indent_level: Each line of the rendering will be
 |          indented this many spaces. Used internally in
 |          recursive calls while pretty-printing.
 |      :param formatter: A Formatter object, or a string naming one of
 |          the standard formatters.
 |      :param errors: An error handling strategy such as
 |          'xmlcharrefreplace'. This value is passed along into
 |          encode() and its value should be one of the constants
 |          defined by Python.
 |      :return: A bytestring.
 |
 |  encode_contents(self, indent_level=None, encoding='utf-8', formatter='minimal')
 |      Renders the contents of this PageElement as a bytestring.
 |
 |      :param indent_level: Each line of the rendering will be
 |         indented this many spaces. Used internally in
 |         recursive calls while pretty-printing.
 |
 |      :param eventual_encoding: The bytestring will be in this encoding.
 |
 |      :param formatter: A Formatter object, or a string naming one of
 |          the standard Formatters.
 |
 |      :return: A bytestring.
 |
 |  find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
 |      Look in the children of this PageElement and find the first
 |      PageElement that matches the given criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param recursive: If this is True, find() will perform a
 |          recursive search of this PageElement's children. Otherwise,
 |          only the direct children will be considered.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  findAll = find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
 |
 |  findChild = find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
 |
 |  findChildren = find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
 |
 |  find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
 |      Look in the children of this PageElement and find all
 |      PageElements that match the given criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param recursive: If this is True, find_all() will perform a
 |          recursive search of this PageElement's children. Otherwise,
 |          only the direct children will be considered.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A ResultSet of PageElements.
 |      :rtype: bs4.element.ResultSet
 |
 |  get(self, key, default=None)
 |      Returns the value of the 'key' attribute for the tag, or
 |      the value given for 'default' if it doesn't have that
 |      attribute.
 |
 |  get_attribute_list(self, key, default=None)
 |      The same as get(), but always returns a list.
 |
 |      :param key: The attribute to look for.
 |      :param default: Use this value if the attribute is not present
 |          on this PageElement.
 |      :return: A list of values, probably containing only a single
 |          value.
 |
 |  get_text(self, separator='', strip=False, types=(<class 'bs4.element.NavigableString'>, <class 'bs4.element.CData'>))
 |      Get all child strings, concatenated using the given separator.
 |
 |      :param separator: Strings will be concatenated using this separator.
 |
 |      :param strip: If True, strings will be stripped before being
 |          concatenated.
 |
 |      :types: A tuple of NavigableString subclasses. Any strings of
 |          a subclass not found in this list will be ignored. By
 |          default, this means only NavigableString and CData objects
 |          will be considered. So no comments, processing instructions,
 |          etc.
 |
 |      :return: A string.
 |
 |  has_attr(self, key)
 |      Does this PageElement have an attribute with the given name?
 |
 |  has_key(self, key)
 |      Deprecated method. This was kind of misleading because has_key()
 |      (attributes) was different from __in__ (contents).
 |
 |      has_key() is gone in Python 3, anyway.
 |
 |  index(self, element)
 |      Find the index of a child by identity, not value.
 |
 |      Avoids issues with tag.contents.index(element) getting the
 |      index of equal elements.
 |
 |      :param element: Look for this PageElement in `self.contents`.
 |
 |  prettify(self, encoding=None, formatter='minimal')
 |      Pretty-print this PageElement as a string.
 |
 |      :param encoding: The eventual encoding of the string. If this is None,
 |          a Unicode string will be returned.
 |      :param formatter: A Formatter object, or a string naming one of
 |          the standard formatters.
 |      :return: A Unicode string (if encoding==None) or a bytestring
 |          (otherwise).
 |
 |  recursiveChildGenerator(self)
 |      Deprecated generator.
 |
 |  renderContents(self, encoding='utf-8', prettyPrint=False, indentLevel=0)
 |      Deprecated method for BS3 compatibility.
 |
 |  select(self, selector, namespaces=None, limit=None, **kwargs)
 |      Perform a CSS selection operation on the current element.
 |
 |      This uses the SoupSieve library.
 |
 |      :param selector: A string containing a CSS selector.
 |
 |      :param namespaces: A dictionary mapping namespace prefixes
 |         used in the CSS selector to namespace URIs. By default,
 |         Beautiful Soup will use the prefixes it encountered while
 |         parsing the document.
 |
 |      :param limit: After finding this number of results, stop looking.
 |
 |      :param kwargs: Keyword arguments to be passed into SoupSieve's
 |         soupsieve.select() method.
 |
 |      :return: A ResultSet of PageElements.
 |      :rtype: bs4.element.ResultSet
 |
 |  select_one(self, selector, namespaces=None, **kwargs)
 |      Perform a CSS selection operation on the current element.
 |
 |      :param selector: A CSS selector.
 |
 |      :param namespaces: A dictionary mapping namespace prefixes
 |         used in the CSS selector to namespace URIs. By default,
 |         Beautiful Soup will use the prefixes it encountered while
 |         parsing the document.
 |
 |      :param kwargs: Keyword arguments to be passed into SoupSieve's
 |         soupsieve.select() method.
 |
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  smooth(self)
 |      Smooth out this element's children by consolidating consecutive
 |      strings.
 |
 |      This makes pretty-printed output look more natural following a
 |      lot of operations that modified the tree.
 |
 |  ----------------------------------------------------------------------
 |  Readonly properties inherited from bs4.element.Tag:
 |
 |  children
 |      Iterate over all direct children of this PageElement.
 |
 |      :yield: A sequence of PageElements.
 |
 |  descendants
 |      Iterate over all children of this PageElement in a
 |      breadth-first sequence.
 |
 |      :yield: A sequence of PageElements.
 |
 |  isSelfClosing
 |      Is this tag an empty-element tag? (aka a self-closing tag)
 |
 |      A tag that has contents is never an empty-element tag.
 |
 |      A tag that has no contents may or may not be an empty-element
 |      tag. It depends on the builder used to create the tag. If the
 |      builder has a designated list of empty-element tags, then only
 |      a tag whose name shows up in that list is considered an
 |      empty-element tag.
 |
 |      If the builder has no designated list of empty-element tags,
 |      then any tag with no contents is an empty-element tag.
 |
 |  is_empty_element
 |      Is this tag an empty-element tag? (aka a self-closing tag)
 |
 |      A tag that has contents is never an empty-element tag.
 |
 |      A tag that has no contents may or may not be an empty-element
 |      tag. It depends on the builder used to create the tag. If the
 |      builder has a designated list of empty-element tags, then only
 |      a tag whose name shows up in that list is considered an
 |      empty-element tag.
 |
 |      If the builder has no designated list of empty-element tags,
 |      then any tag with no contents is an empty-element tag.
 |
 |  strings
 |      Yield all strings of certain classes, possibly stripping them.
 |
 |      :param strip: If True, all strings will be stripped before being
 |          yielded.
 |
 |      :types: A tuple of NavigableString subclasses. Any strings of
 |          a subclass not found in this list will be ignored. By
 |          default, this means only NavigableString and CData objects
 |          will be considered. So no comments, processing instructions,
 |          etc.
 |
 |      :yield: A sequence of strings.
 |
 |  stripped_strings
 |      Yield all strings in the document, stripping them first.
 |
 |      :yield: A sequence of stripped strings.
 |
 |  text
 |      Get all child strings, concatenated using the given separator.
 |
 |      :param separator: Strings will be concatenated using this separator.
 |
 |      :param strip: If True, strings will be stripped before being
 |          concatenated.
 |
 |      :types: A tuple of NavigableString subclasses. Any strings of
 |          a subclass not found in this list will be ignored. By
 |          default, this means only NavigableString and CData objects
 |          will be considered. So no comments, processing instructions,
 |          etc.
 |
 |      :return: A string.
 |
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from bs4.element.Tag:
 |
 |  parserClass
 |
 |  string
 |      Convenience property to get the single string within this
 |      PageElement.
 |
 |      TODO It might make sense to have NavigableString.string return
 |      itself.
 |
 |      :return: If this element has a single string child, return
 |       value is that string. If this element has one child tag,
 |       return value is the 'string' attribute of the child tag,
 |       recursively. If this element is itself a string, has no
 |       children, or has more than one child, return value is None.
 |
 |  ----------------------------------------------------------------------
 |  Methods inherited from bs4.element.PageElement:
 |
 |  append(self, tag)
 |      Appends the given PageElement to the contents of this one.
 |
 |      :param tag: A PageElement.
 |
 |  extend(self, tags)
 |      Appends the given PageElements to this one's contents.
 |
 |      :param tags: A list of PageElements.
 |
 |  extract(self)
 |      Destructively rips this element out of the tree.
 |
 |      :return: `self`, no longer part of the tree.
 |
 |  fetchNextSiblings = find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  fetchParents = find_parents(self, name=None, attrs={}, limit=None, **kwargs)
 |
 |  fetchPrevious = find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  fetchPreviousSiblings = find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  findAllNext = find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  findAllPrevious = find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  findNext = find_next(self, name=None, attrs={}, text=None, **kwargs)
 |
 |  findNextSibling = find_next_sibling(self, name=None, attrs={}, text=None, **kwargs)
 |
 |  findNextSiblings = find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  findParent = find_parent(self, name=None, attrs={}, **kwargs)
 |
 |  findParents = find_parents(self, name=None, attrs={}, limit=None, **kwargs)
 |
 |  findPrevious = find_previous(self, name=None, attrs={}, text=None, **kwargs)
 |
 |  findPreviousSibling = find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs)
 |
 |  findPreviousSiblings = find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |
 |  find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |      Find all PageElements that match the given criteria and appear
 |      later in the document than this PageElement.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A ResultSet containing PageElements.
 |
 |  find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |      Look backwards in the document from this PageElement and find all
 |      PageElements that match the given criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A ResultSet of PageElements.
 |      :rtype: bs4.element.ResultSet
 |
 |  find_next(self, name=None, attrs={}, text=None, **kwargs)
 |      Find the first PageElement that matches the given criteria and
 |      appears later in the document than this PageElement.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_next_sibling(self, name=None, attrs={}, text=None, **kwargs)
 |      Find the closest sibling to this PageElement that matches the
 |      given criteria and appears later in the document.
 |
 |      All find_* methods take a common set of arguments. See the
 |      online documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |      Find all siblings of this PageElement that match the given criteria
 |      and appear later in the document.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A ResultSet of PageElements.
 |      :rtype: bs4.element.ResultSet
 |
 |  find_parent(self, name=None, attrs={}, **kwargs)
 |      Find the closest parent of this PageElement that matches the given
 |      criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :kwargs: A dictionary of filters on attribute values.
 |
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_parents(self, name=None, attrs={}, limit=None, **kwargs)
 |      Find all parents of this PageElement that match the given criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_previous(self, name=None, attrs={}, text=None, **kwargs)
 |      Look backwards in the document from this PageElement and find the
 |      first PageElement that matches the given criteria.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs)
 |      Returns the closest sibling to this PageElement that matches the
 |      given criteria and appears earlier in the document.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
 |      Returns all siblings to this PageElement that match the
 |      given criteria and appear earlier in the document.
 |
 |      All find_* methods take a common set of arguments. See the online
 |      documentation for detailed explanations.
 |
 |      :param name: A filter on tag name.
 |      :param attrs: A dictionary of filters on attribute values.
 |      :param text: A filter for a NavigableString with specific text.
 |      :param limit: Stop looking after finding this many results.
 |      :kwargs: A dictionary of filters on attribute values.
 |      :return: A ResultSet of PageElements.
 |      :rtype: bs4.element.ResultSet
 |
 |  format_string(self, s, formatter)
 |      Format the given string using the given formatter.
 |
 |      :param s: A string.
 |      :param formatter: A Formatter object, or a string naming one of the standard formatters.
 |
 |  formatter_for_name(self, formatter)
 |      Look up or create a Formatter for the given identifier,
 |      if necessary.
 |
 |      :param formatter: Can be a Formatter object (used as-is), a
 |          function (used as the entity substitution hook for an
 |          XMLFormatter or HTMLFormatter), or a string (used to look
 |          up an XMLFormatter or HTMLFormatter in the appropriate
 |          registry.
 |
 |  insert(self, position, new_child)
 |      Insert a new PageElement in the list of this PageElement's children.
 |
 |      This works the same way as `list.insert`.
 |
 |      :param position: The numeric position that should be occupied
 |         in `self.children` by the new PageElement.
 |      :param new_child: A PageElement.
 |
 |  nextGenerator(self)
 |      # Old non-property versions of the generators, for backwards
 |      # compatibility with BS3.
 |
 |  nextSiblingGenerator(self)
 |
 |  parentGenerator(self)
 |
 |  previousGenerator(self)
 |
 |  previousSiblingGenerator(self)
 |
 |  replaceWith = replace_with(self, replace_with)
 |
 |  replaceWithChildren = unwrap(self)
 |
 |  replace_with(self, replace_with)
 |      Replace this PageElement with another one, keeping the rest of the
 |      tree the same.
 |
 |      :param replace_with: A PageElement.
 |      :return: `self`, no longer part of the tree.
 |
 |  replace_with_children = unwrap(self)
 |
 |  setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None)
 |      Sets up the initial relations between this element and
 |      other elements.
 |
 |      :param parent: The parent of this element.
 |
 |      :param previous_element: The element parsed immediately before
 |          this one.
 |
 |      :param next_element: The element parsed immediately before
 |          this one.
 |
 |      :param previous_sibling: The most recently encountered element
 |          on the same level of the parse tree as this one.
 |
 |      :param previous_sibling: The next element to be encountered
 |          on the same level of the parse tree as this one.
 |
 |  unwrap(self)
 |      Replace this PageElement with its contents.
 |
 |      :return: `self`, no longer part of the tree.
 |
 |  wrap(self, wrap_inside)
 |      Wrap this PageElement inside another one.
 |
 |      :param wrap_inside: A PageElement.
 |      :return: `wrap_inside`, occupying the position in the tree that used
 |         to be occupied by `self`, and with `self` inside it.
 |
 |  ----------------------------------------------------------------------
 |  Readonly properties inherited from bs4.element.PageElement:
 |
 |  next
 |      The PageElement, if any, that was parsed just after this one.
 |
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  next_elements
 |      All PageElements that were parsed after this one.
 |
 |      :yield: A sequence of PageElements.
 |
 |  next_siblings
 |      All PageElements that are siblings of this one but were parsed
 |      later.
 |
 |      :yield: A sequence of PageElements.
 |
 |  parents
 |      All PageElements that are parents of this PageElement.
 |
 |      :yield: A sequence of PageElements.
 |
 |  previous
 |      The PageElement, if any, that was parsed just before this one.
 |
 |      :return: A PageElement.
 |      :rtype: bs4.element.PageElement
 |
 |  previous_elements
 |      All PageElements that were parsed before this one.
 |
 |      :yield: A sequence of PageElements.
 |
 |  previous_siblings
 |      All PageElements that are siblings of this one but were parsed
 |      earlier.
 |
 |      :yield: A sequence of PageElements.
 |
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from bs4.element.PageElement:
 |
 |  __dict__
 |      dictionary for instance variables (if defined)
 |
 |  __weakref__
 |      list of weak references to the object (if defined)
 |
 |  nextSibling
 |
 |  previousSibling


BeautifulSoup: using find_all

Collected snippet: crawler, BeautifulSoup find_all usage.

import requests
from bs4 import BeautifulSoup as bs


webFile=requests.get("https://www.pku.edu.cn")#fetch the HTML page
webFile.encoding="utf-8"#set the encoding so the text decodes correctly
data=webFile.text#the page body as a plain string

soup=bs(data,"html.parser")#parse the page into a BeautifulSoup object
##
##items=soup.find_all(class_="h")
##for i in items:
##    print(i)



items2=soup.find_all(class_="item")

for iTag in items2:
    for i in iTag.find_all():
        print(i)



The difference between requests and BeautifulSoup

import requests
file1=requests.get("https://www.pku.edu.cn")

file1.encoding="utf-8"
data=file1.text

myFile=open(r"E:\pkuCode.txt","w",encoding="utf-8")
print(data,file=myFile)

myFile.close()


'''
soup has type <class 'bs4.BeautifulSoup'>, i.e. soup is a BeautifulSoup object.
Printing soup shows the complete HTML source of the requested page.
Although response.text and the printed soup look identical, they belong to different classes: <class 'str'> versus <class 'bs4.BeautifulSoup'>. The former is a plain string; the latter is a parsed BeautifulSoup object. They print the same text only because printing a BeautifulSoup object calls its __str__ method, which renders the parse tree back into a string.
'''
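
A minimal sketch, assuming data still holds the page source fetched above, that makes the type difference visible:

from bs4 import BeautifulSoup
soup=BeautifulSoup(data,"html.parser")
print(type(data))#<class 'str'>
print(type(soup))#<class 'bs4.BeautifulSoup'>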

Extracting the text of a page and presenting it in order

Crawler + bs4: extract the links and text from the PKU homepage and present them in order.

import requests
from bs4 import BeautifulSoup as bs


webFile=requests.get("https://www.pku.edu.cn")#fetch the HTML page
webFile.encoding="utf-8"#set the encoding so the text decodes correctly
data=webFile.text#the page body as a plain string

soup=bs(data,"html.parser")#parse the page into a BeautifulSoup object


items2=soup.find_all(class_="item")

##for iTag in items2:
##    for i in iTag.find_all():
##        myText=i.get_text()
##        print(myText)
##

for everyTag in items2:
    print(everyTag)

    print()
    print("Text part")
    myText=everyTag.get_text()
    print(myText)

    print()
    print("Link part")
    myLinks=everyTag.find_all("a")#each element of myLinks is a bs4 Tag
    for everyLink in myLinks:
        if "href" in everyLink.attrs:#.attrs is only available on bs4 Tag objects
            print(everyLink)
    input()#pause; press Enter to move on to the next block


global

 Variable scope
 A variable assigned inside a function can only be used inside that function (local scope); such variables are called local variables.
 A variable assigned outside of all functions can be used anywhere in the program (global scope); such variables are called global variables.
To make an assignment inside a function target a global variable, use the global statement.

tfc = 1000

def tvc():
    global tvc  # the global statement usually comes first in the function body; it tells Python "tvc refers to the global name, do not create a local variable"
    vc = 200
    x = 10
    tvc = vc * x  # note: this rebinds the global name tvc (previously the function itself) to 2000, so tvc() can only be called once

def tc():
    print(tfc+tvc)  # tc() can now read the global tvc assigned above

tvc()
tc()
# >> 3000


 

match

import re
m=re.match("hello","hellov world")
if m is not None:
    print(m.group())
    
print(m.__class__.__name__)


m=re.match("bird","bird is flying")
print(m.group())
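
re.match only matches at the beginning of the string, which is why "hello" matches "hellov world" above. A minimal sketch contrasting it with re.search, which scans the whole string:

import re
print(re.match("world","hello world"))#None: "world" is not at the start
print(re.search("world","hello world").group())#'world': search scans the entire string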

Using soup.prettify() for an orderly display

import requests
import csv
from bs4 import BeautifulSoup as bs

url="https://www.zhihu.com/follow"

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
    'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
    'limit': '20',
    'sort_by': 'created'
}

webFile= requests.get(url, params=params, headers=headers)
webFile.encoding="utf-8"
data=webFile.text   

soup=bs(data,"html.parser")
print(soup.prettify())

Scraping bookmark labels

Extracting bookmark labels from a saved web page

After saving a page with Ctrl+S from Chaoxing (超星), Wikisource (维基), CNKI (知网) or Apabi (阿帕比), the code below extracts the table-of-contents text from the saved file.

myWord="""
[Images]
[Font]
Language=GBK
FontSize=7
Margin=0.5

[Bkmk]
File=FreePic2Pdf_bkmk.txt
AddAsText=0
ShowBkmk=1
ShowAll=1
BasePage=1

[Main]
ContentsPage=
TextPage=
"""
Head='''
首
\t书名页
\t版权页
\t序言
目录

'''

def test():
    htmlName=str(input("请输入网页Wiki CNKI ChoaXing Apabi文件名称:"))

    import requests
    from bs4 import BeautifulSoup as bs

    webFile=open(htmlName,"r",encoding="utf-8")
    data=webFile.read()
    webFile.close()

    mysoup=bs(data,"html.parser")
    mysoup.prettify()

    writeFile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")

    print(Head,file=writeFile)

                 
    if "维基文库" in htmlName:
        print("Wiki")
        result=mysoup.find_all("li")
        choice=input("请选择通行A 或 调试T:")
        for i in result:
            myInfo=i.get_text()

            if choice=="A":
                if "卷" in myInfo:
                    mylist=myInfo.split(" ")
                    print(mylist[0],file=writeFile)
                    for m in mylist[1:]:
                        print("\t",m,file=writeFile)
            elif choice=="T":
                if "卷" in myInfo:
                    print(myInfo,file=writeFile)
                else:
                    print("\t",myInfo,file=writeFile)
    elif "阿帕比" in htmlName:
        print("Apabi")
        result=mysoup.find_all("li")
        for i in result:
            myInfo=i.get_text()
            for word in "()1234567890页":
                myInfo=myInfo.replace(word,"")

            infoList=myInfo.split(" ")
            if len(infoList)>2:#exclude single-element entries; handle the split uniformly
                print(infoList[1],file=writeFile)
                for m in infoList[2:]:
                        print("\t",m,file=writeFile)
            elif len(infoList)==2:
                print("\t",myInfo,file=writeFile)
                
        
    elif "中国知网" in htmlName or "CNKI" in htmlName:
        print("CNKI")
        result=mysoup.find_all(attrs={"class":"catalog-listDiv"})
        if len(result)==0:
            result=mysoup.find_all("li")
        
        for i in result:
            myInfo=i.get_text()
            infoline=myInfo.split("    ")
            for line in infoline:
                if "摘要" in line:
                    nline=line.split(" ")
                    for m in nline:
                        print(m,file=writeFile)
                elif "第" in line and  "章" in line and "节" not in line:
                    wline=line.split(" ")
                    print("\t",wline[0],file=writeFile)
                    for m in wline[1:]:
                        print(m,end="",file=writeFile)
                    print("\n",file=writeFile)

     
                elif "结语 参考文献 致谢" in line:
                    nline=line.split(" ")
                    print(nline[0]+nline[1],file=writeFile)
                    for m in nline[2:]:
                        print(m,file=writeFile)
                else:print("\t",line,file=writeFile)

    else:
        print("ChaoXing")
        result=mysoup.find_all("span")

        for i in result:
            if "node_name" in str(i):
                sen=i.get_text()
                sen=sen.lstrip(" ")
                
                if  "第" in str(i) and "章" in str(i):
                    print(sen,file=writeFile)
                elif  "第" in str(i) and "讲" in str(i):
                    print(sen,file=writeFile)
                elif "卷" in str(i) or "论" in str(i) or "编" in str(i):
                    for hz in "一二三四五六七八九十":
                        if hz in str(i):
                            print(sen,file=writeFile)
                            break
                    else:print("\t",sen,file=writeFile)

                else:
                    print("\t",sen,file=writeFile)


    print("尾",file=writeFile)
    writeFile.close()

    itfFile=open("FreePic2Pdf.itf","w",encoding="utf-8")
    print(myWord,file=itfFile)
    itfFile.close()



This completes the task.

Walkthrough: scraping a saved e-book page

As an example, take an electronic text from the ebook collection (https://ebook.dswxyjy.org.cn/) and walk through the process.

The book is 《建国以来重要文献选编》(第一册).

Log in to the site and click through until the e-book's table of contents is displayed.

Then save the page locally with Ctrl+S. The code below extracts the text from the saved file.

htmlName="建国以来重要文献选编(第一册).html"

import requests
from bs4 import BeautifulSoup as bs

webFile=open(htmlName,"r",encoding="utf-8")
data=webFile.read()
webFile.close()


mysoup=bs(data,"html.parser")
mysoup.prettify()

print(mysoup.prettify())

Running this from the command line produces output like the following (excerpt):

left: 56178px;">
       <div class="item left" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); left: 0px; right: auto;">
        <img style='background-color: rgb(255, 255, 255); background-image: url("style/icon/loading.gif"); background-repeat: no-repeat; background-position: center center; width: 118px; height: 170px;'/>
        <p class="title" style="display: none;">
         454
        </p>
       </div>
       <div class="item right" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); right: 0px; left: auto;">
        <img style='background-color: rgb(255, 255, 255); background-image: url("style/icon/loading.gif"); background-repeat: no-repeat; background-position: center center; width: 118px; height: 170px;'/>

At this point, the table-of-contents text we need does not appear in the output.

However, opening the downloaded HTML file in a plain-text editor shows that the TOC text is in fact there, for example:

<li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: rgba(255, 255, 255, 0.1); font-weight: bold;"><img class="arrow" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAAXNSR0IArs4c6QAAADtJREFUKFNjZCASMBKpjoE0hf+Py2RGRkawYXATsSmGKUJRCOIgK0ZWhKEQphhdEVaFON1K9eABAFosEAvuw3jXAAAAAElFTkSuQmCC" style="left: 0px;"><p class="description" style="margin-left: 15px;">《建国以来重要文献选编》(第一册)</p></li><ul class="itemList"><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">中国人民政治协商会议共同纲领(一九四九年九月二十九日中国人民政治协商会议第一届全体会议通过)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">人民政协共同纲领草案的特点(一九四九年九月二十二日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">中华人民共和国中央人民政府公告(一九四九年十月一日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">中国人民解放军总部命令(一九四九年十月一日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">中共中央关于少数民族“自决权”问题给二野前委的指示(一九四九年十月五日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">新华总社关于土改后农村阶级划分问题给东北总分社的复电(一九四九年十月十一日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">关于学习松江县召开各界人民代表会议经验的指示(一九四九年十月十三日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); background: transparent;"><p class="description" style="margin-left: 25px;">毛泽东批转薄一波《关于华北各城市召开各界代表会议的情形和经验的报告》(一九四九年十月三十日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0);"><p class="description" style="margin-left: 25px;">制止物价猛涨(一九四九年十一月十三日)</p></li><li class="item" style="touch-action: manipulation; user-select: none; -webkit-user-drag: none; -webkit-tap-highlight-color: rgba(0, 0, 0, 0);"><p class="description" style="margin-left: 25px;">关于大量吸收和培养少数民族干部的指示(一九四九年十一月十四日)</p></li>

Given this, a different strategy is needed.

Looking at the raw text, each title turns out to sit in an element of the following form:

<p class="description" style="margin-left: 25px;">中央人民政府委员会关于发行人民胜利折实公债的决定

So the elements can be selected with: mysoup.find_all(attrs={"class":"description"})

htmlName="建国以来重要文献选编(第一册).html"

import requests
from bs4 import BeautifulSoup as bs

webFile=open(htmlName,"r",encoding="utf-8")
data=webFile.read()
webFile.close()



mysoup=bs(data,"html.parser")
mysoup.prettify()

result=mysoup.find_all(attrs={"class":"description"})#select the elements that contain the TOC text

for i in result:
    myInfo=i.get_text()#extract the text from the element
    print(myInfo)

This pulls out the TOC text. After some debugging, the complete code is as follows:

htmlName="建国以来重要文献选编(第四册).html"

Head='''首\n\t书名页\n\t版权页\n\t序言\n目录'''

import requests
from bs4 import BeautifulSoup as bs

webFile=open(htmlName,"r",encoding="utf-8")
data=webFile.read()
webFile.close()


writeFile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")
print(Head,file=writeFile)

mysoup=bs(data,"html.parser")
mysoup.prettify()

result=mysoup.find_all(attrs={"class":"description"})#select the elements that contain the TOC text

for i in result[1:]:
    myInfo=i.get_text()#extract the text from the element
    print(myInfo,file=writeFile)

writeFile.close()

Finally, the finished program:

#myexample="建国以来重要文献选编(第四册).html"

htmlName=input("请输入社科院ebook网页文件名称:")
Head='''首\n\t书名页\n\t版权页\n\t序言\n目录'''

import requests
from bs4 import BeautifulSoup as bs

webFile=open(htmlName,"r",encoding="utf-8")
data=webFile.read()
webFile.close()


writeFile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")
print(Head,file=writeFile)

mysoup=bs(data,"html.parser")
mysoup.prettify()

result=mysoup.find_all(attrs={"class":"description"})#select the elements that contain the TOC text

for i in result[1:]:
    myInfo=i.get_text()#extract the text from the element
    print(myInfo,file=writeFile)

writeFile.close()
print("已输出完毕。")




That completes the scraping task.

Extracting labels from a text file

The following code can be used:

def test():
    import re
    pattern="“.*?[。?:;”]"

    fileName=input("选择句子开头作为标签,请输入文本名称:")#说文解字,尔雅

    part=input("请输入1或2个区分层级关键词{第部章卷...}:")
    if len(part)==1:
        a=part
        b=part
    elif len(part)==2:
        a=part[0]
        b=part[1]
    choice="L"
    choice=input("文本对话选L;Wiki目录选W;开头首字母选S;开头前面句子选E:")
    choice=choice.upper()

    file=open(fileName,"r",encoding="utf-8")
    data=file.read()
    file.close()

    data=data.replace("编辑","")
    datalines=data.splitlines()

    def ShuoWen():
        #说文
        for line in datalines:
            for word in line:
                if word in "(( )0123456789:↑":
                    break
            print("\t",word,file=wfile)
    def ErYa():
        for line in datalines:
            if part in line:
                print(line,file=wfile)
            else:print("\t",line[:5],file=wfile)
    def Wiki():
        for line in datalines:
            if part in line and len(line)<=4 and len(line)>=2:
                print(line,file=wfile)
            elif "↑" in line or "◄" in line or "►" in line or " 註釋"  in line:pass
            elif len(line)>=2 and len(line)<=10:
                print("\t",line,file=wfile)            
    def LunYu():
        zhang=0
        
        jieming=0
        for line in datalines:
            if a in line and b in line:
                print(line,file=wfile)
                zhang+=1
                jieming=1

            if a not in line and b not in line and len(line)>4:#[Note] two consecutive if statements differ from if/else: both conditions are tested independently, whereas if/else takes only one branch
                result=re.compile(pattern).findall(line)
                print("\t",f"{zhang}.{jieming}",end="",file=wfile)

                if len(result)!=0:#prefer the sentence inside quotation marks
                    jieming+=1
                    n=0
                    for i in result:
                        i=i.lstrip("“")
                        print(i,file=wfile)
                        n+=1
                        if n==1:
                           break                    
                else:#no quoted sentence: fall back to the opening clause
                    jieming+=1
                    for w in line:
                        print(w,end="",file=wfile)
                        if w in ":。;":
                            break
                print("\n",file=wfile)
            
            

    wfile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")
    if choice=="S":
        ShuoWen()
    elif choice=="E":
        ErYa()
    elif choice=="W":
        Wiki()
    elif choice=="L":
        LunYu()
    wfile.close()
    print("已经完成")

That does it.

Scraping text

Crawler practice: downloading text from Wikisource

def test():
    import requests
    from bs4 import BeautifulSoup as bs
    import time
    import random
    import re 

    webUrl=input("请输入书籍所在的维基网址:")
    infoList=webUrl.split("/")
    articleName=infoList[-1]
    

    startTime=time.time()


    writeFile=open(f"{articleName}.txt","a",encoding="utf-8")
    webFile=requests.get(webUrl)
    webFile.encoding="utf-8"
    
    data=webFile.text

    obs=bs(data,"html.parser")
    obs.prettify()
    resultLink=obs.find_all("li")

    webList=[]
    for link in resultLink:
        if articleName in str(link):
            iname=link.get_text()
            iweb=webUrl+"/"+iname
            webList.append(iweb)

    for iweb in webList:
        print(iweb)
        iFile=requests.get(iweb)
        iFile.encoding="utf-8"
        idata=iFile.text
        iobs=bs(idata,"html.parser")
        iobs.prettify()

        result0=iobs.find_all(attrs={"class":"section-heading"})


##        result1=iobs.find_all("section")
##        print(result1)
        
        result1=iobs.find_all(attrs={"class":"mw-parser-output"})
##        for i in result1:
##            print(i.get_text(),file=writeFile)
##
        if len(result0)!=0:
            result1.pop(0)#use this branch when the page headings contain extra information
            xy=zip(result0,result1)
            for i in xy:
                print(i[0].get_text()[:-2],file=writeFile)#this branch was used when downloading 春秋左傳正義
                print(i[1].get_text(),file=writeFile)
                
        else:
            for i in result1:
                print(i.get_text(),file=writeFile)#this branch was used when downloading 史记三家注
                
        time.sleep(0.05+random.randint(0,2))
    writeFile.close()

    endTime=time.time()
    long=(endTime-startTime)/60
    print("总记时:","{0:4.2}".format(long),"分钟。")

The code can be tidied a little further; the version below adds comments noting that the chapter-link format may need adjusting for some sites.

def test():
    import requests
    from bs4 import BeautifulSoup as bs
    import time
    import random
    import re 

    webUrl=input("请输入书籍所在的维基网址:")
    infoList=webUrl.split("/")
    articleName=infoList[-1]
    

    startTime=time.time()


    writeFile=open(f"{articleName}.txt","a",encoding="utf-8")
    webFile=requests.get(webUrl)
    webFile.encoding="utf-8"
    
    data=webFile.text

    obs=bs(data,"html.parser")
    obs.prettify()
    resultLink=obs.find_all("li")

    webList=[]#the chapter-link format may need adjusting for the actual site
    for link in resultLink:
        if articleName in str(link):
            iname=link.get_text()
            iweb=webUrl+"/"+iname
            webList.append(iweb)#有的网站是“卷01”,不按照链接体现的格式。这个就得调整程序了。
    for iweb in webList:
        print(iweb)
        iFile=requests.get(iweb)
        iFile.encoding="utf-8"
        idata=iFile.text
        iobs=bs(idata,"html.parser")
        iobs.prettify()

        result0=iobs.find_all(attrs={"class":"section-heading"})


##        result1=iobs.find_all("section")
##        print(result1)
        
        result1=iobs.find_all(attrs={"class":"mw-parser-output"})
##        for i in result1:
##            print(i.get_text(),file=writeFile)
##
        if len(result0)!=0:
            result1.pop(0)#如果开头标题有多余信息,则走这个分支处理
            xy=zip(result0,result1)
            for i in xy:
                print(i[0].get_text()[:-2],file=writeFile)#下载《春秋左传正义》的时候用了这个程序
                print(i[1].get_text(),file=writeFile)
                
        else:
            for i in result1:
                print(i.get_text(),file=writeFile)#下载《史记三家注》用了这个程序
                
        time.sleep(0.05+random.randint(0,2))
    writeFile.close()


    endTime=time.time()
    long=(endTime-startTime)/60
    print("总记时:","{0:4.2}".format(long),"分钟。")
test()


爬虫实践从zdic中下载文本

import requests
from bs4 import BeautifulSoup as bs
import time
import random
import re 

def test():
    a=int(input("请输入汉典网页起始页码:"))
    b=int(input("请输入汉典网页终止页码:"))

    myName=input("请输入目标文件名:")

    startTime=time.time()
    HouZhui=".docx"
    resultName=myName+HouZhui




    urlList=[]

    for i in range(a,b+1):
        webUrl="https://gj.zdic.net/archive.php?aid-"+str(i)+".html"
        urlList.append(webUrl)

    zongShu=len(urlList)
    n=1


    writeFile=open(resultName,"w",encoding="utf-8")

    for webUrl in urlList:
        webfile=requests.get(webUrl)
        webfile.encoding="utf-8"
        data=webfile.text
        
        obs=bs(data,"html.parser")
        obs.prettify()
        title=obs.title

        for i in title:
            print("\n",file=writeFile)
            print(i,file=writeFile)
            print("★",file=writeFile)

        result=obs.find_all(attrs={"id":"snr2"})
        art=str(result)
        artlines=art.splitlines()
        article=artlines[0][17:]
        article=article.replace("<br/>","s")
        for i in article:
            
            if i=="s":
                print("\n",file=writeFile)
                print("\t",file=writeFile)
            else:print(i,end="",sep="",file=writeFile)
        print("……",file=writeFile)
        print("\n",file=writeFile)
        time.sleep(0.05+random.randint(0,2))
        percent=float(n/zongShu)
        print(f"第{n}页已完成,共计{zongShu}页,完成度","{0:4.2}".format(percent))
        n+=1
    writeFile.close()
        
    endTime=time.time()
    long=(endTime-startTime)/60
    print("总记时:","{0:4.2}".format(long),"分钟。")

爬虫实践 从ctext 中下载文本

从ctext 中下载文本。可以用到ctext包。

ctext 相关说明如下

https://pypi.org/project/ctext/

下面以《论语》为例,说明如何下载。

代码如下:

from ctext import *
setapikey("your-api-key-goes-here")
setlanguage("zh")

stats = getstats()
status = getstatus()
titles = gettexttitles()
capabilities = getcapabilities()

urn = readlink("https://ctext.org/analects")#以论语为例

passages = gettext("ctp:analects/xue-er")
print(passages)




又有如下程序,亦可以实现功能。

def test():
    '''
    https://ctext.org/wiki.pl?if=gb&chapter=868712
    https://ctext.org/wiki.pl?if=gb&chapter=969206
    webUrl="https://ctext.org/wiki.pl?if=gb&res=970278
    '''
    import requests
    from bs4 import BeautifulSoup as bs

    import time
    import random

    headers={}#建立字典
    user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) Gecko/20100101 Firefox/61.0",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
                    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
                    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
                    ]



    startTime=time.time()
    webUrl=input("请输入书本在Ctext中目录所在网址:")#目录页所在编码,可以获得每章的链接
    ##webUrl="https://ctext.org/wiki.pl?if=gb&res=642006"
    startPage=int(input("请输入目录列表中所求链接的序列数字:"))
    webfile=requests.get(webUrl)
    webfile.encoding="utf-8"
    data=webfile.text

    obs=bs(data,"html.parser")
    obs.prettify()


    result=obs.find_all("a")
    Name=obs.h2
    nameStr=Name.get_text()
    nameList=nameStr.split("[")
    resultName=nameList[0]




    urlList=[]
    for i in result:
        if "wiki.pl?" and "卷" in str(i):
            url=list(i.attrs.values())

            webLink="https://ctext.org/"+url[0]
            urlList.append(webLink)
        elif "wiki.pl?" and "序" in str(i):
            url=list(i.attrs.values())
            webLink="https://ctext.org/"+url[0]
            urlList.append(webLink)


    numList=[str(i) for i in range(0,10)]

    zongShu=len(urlList)


    n=0
    writeFile=open(f"{resultName}_FromCtext.txt","a+",encoding="utf-8")

    start=startPage-1
    for webUrl in urlList[start:]:#列表从0开始
        headers['User-Agent']= random.choice(user_agent_list)

        print(webUrl)
        webfile=requests.get(webUrl,headers=headers)
        webfile.encoding="utf-8"
        data=webfile.text
        
        obs=bs(data,"html.parser")
        obs.prettify()
        title=obs.title

        for i in title:
            print(i,file=writeFile)
            print("★",file=writeFile)

        result=obs.find_all(class_="ctext")
        for i in result:
            myStr=i.get_text()
            for num in numList:
                myStr=myStr.replace(num,"")
            print(myStr,file=writeFile)
        n+=1
            

        time.sleep(3+random.randint(0,3))
        percent=float((n+start)/zongShu)
        print(f"第{n+start}页已完成,共计{zongShu}页,完成度","{0:4.2}".format(percent))

        
        
    endTime=time.time()
    long=(endTime-startTime)/60
    print("总记时:","{0:4.2}".format(long),"分钟。")
    writeFile.close()


三者结合

想要使得三者结合,有如下代码:

'''
https://gj.zdic.net/archive.php?aid-6679.html


'''
def test():
    webChoice=input("汉典:Z;维基:W;哲学电子书:C。请输入选择:")
    webChoice=webChoice.upper()
    if webChoice=="Z":
        import BsFromZdic
        BsFromZdic.test()
    elif webChoice=="W":
        import BsFromWik
        BsFromWik.test()
    elif webChoice=="C":
        import BsFromCtext
        BsFromCtext.test()

爬取图片

贴吧中的动画图片

《虹猫蓝兔七侠传》是一部非常不错的动画片,后续还有漫画版的前传和后传。百度贴吧中,有这样一系列图片,现在想把图片爬下来,合成PDF便于阅读。写如下代码:

'''
<img class="BDE_Image" src="https://imgsa.baidu.com/forum/w%3D580/sign=fbff
fefc1f950a7b75354ecc3ad3625c/4c5fa44bd11373f035f5ca55a60f4bfbf9ed04ca.jpg" pic_ext="jpeg" pic_type="0"
width="560" height="388">

<img class="BDE_Image" src="https://imgsa.baidu.com/forum/w%3D580/sign=efda23249e82d158bb8259b9b00
819d5/acb1c2ef76094b36927cbe27a1cc7cd98c109d2e.jpg"
pic_ext="jpeg" pic_type="0" width="560" height="426">

<img class="image_original_original"
style="z-index: 2; width: 585.176px; height: 450px; top: 0px; left: 75.9122px;"
src="http://imgsrc.baidu.com/forum/pic/item/f82405d7912397dd928f3cce5b82b2b7d1a28726.jpg">

'''



urlList=["http://tieba.baidu.com/p/3175345087",
         "http://tieba.baidu.com/p/3175362317",
         "http://tieba.baidu.com/p/3175373350",
         "http://tieba.baidu.com/p/3175383386",
        "http://tieba.baidu.com/p/3175393635",
        "http://tieba.baidu.com/p/3175402697",]

import urllib.request
import re

zhang=1
for webUrl in urlList:

    i=1
    htmll=urllib.request.urlopen(webUrl).read()

    data=str(htmll)


    pattern=r'''img class="image_original_original".*?src="(.+?\.jpg)"'''#改用原始字符串,并允许class与src之间还有style等其它属性
    result=re.compile(pattern).findall(data)

    for imageUrl in result:
        print(imageUrl)
##        print(imageUrl)
##        imageName=str(zhang)+"-"+str(i)+".jpg"
##        i=i+1
##        urllib.request.urlretrieve(imageUrl,filename=imageName)
##    zhang=zhang+1
##    
    print()

注意:爬取后的图片,有模糊图和高清图两种,名称并不一样。 高清图的位置不在页面中而是在别处,需要修正一下网络链接才能爬取。
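
修正链接的思路大致如下:缩略图和高清图往往只是主机与路径不同、文件名(哈希)相同,可以把缩略图链接里的文件名拼接到高清图地址的前缀后面。下面是一个最小示意,前缀取自上文观察到的 http://imgsrc.baidu.com/forum/pic/item/,具体规律仍以实际页面为准:

#最小示意:把缩略图链接改写成高清图链接(假设两者共用同一个文件名哈希)
thumbUrl="https://imgsa.baidu.com/forum/w%3D580/sign=fbfffefc1f950a7b75354ecc3ad3625c/4c5fa44bd11373f035f5ca55a60f4bfbf9ed04ca.jpg"
hdPrefix="http://imgsrc.baidu.com/forum/pic/item/"

fileName=thumbUrl.split("/")[-1]#文件名(哈希)在两种链接里是一样的
hdUrl=hdPrefix+fileName
print(hdUrl)
#确认链接能正常打开后,再用 urllib.request.urlretrieve(hdUrl,"图1.jpg") 下载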

那么,能否从中找到共性,写成通用的代码呢?于是做了如下尝试:

def test():
    '''
    http://imgsrc.baidu.com/forum/pic/item/e69597510fb30f24ebcb4ec9ca95d143ac4b0347.jpg
    http://imgsrc.baidu.com/forum/pic/item/4c0f7af082025aaf165fdc01f9edab64024f1aa3.jpg

    '''
    import urllib.request
    import requests
    from bs4 import BeautifulSoup as bs
    import re
    print("每个网站的情况并不一致,借鉴此程序后,重新写代码为宜。")
    mychoice=input("是否继续 Y or N:")
    if mychoice=="Y":
        pass
    else:
        exit()
        
    print("如为避免遗漏而需下载网页,请复制网页代码到web.html并输入D。")
    print("如在网上运行,请输入W。")  
    choice=input("Download or Web:")


    webUrlList=[]
    while True:
        webUrl=input("请输入要下载图片所在的完整网站:")
        webUrlList.append(webUrl)
        webChoice=input("是否继续输入网站,Y or N:")
        if webChoice=="N":
            break

    ##webUrl="https://baike.baidu.com/pic/黑小虎传奇/4659511"#点击进入百度黑小虎传奇图册。
    adjust=input("是否需要调整高清图,Y or N:")

    classImage=str(input("请输入obs寻找到的class类别:"))

    pattern='src="..*?"'
    
    zhang=1

    if choice=="D" and adjust=="N":
        myfile=open("web.html","r",encoding="utf-8")
        data=myfile.read()
        myfile.close()
        obs=bs(data,"html.parser")
        result=obs.find_all(attrs={"class":classImage})
        n=1
        for i in result:
            myLink=re.findall(pattern,str(i))
            bLink=str(myLink[0])[5:-1]#去掉匹配结果里的 src=" 前缀和结尾引号,得到纯链接
            print(bLink)
            imageName="图"+str(n)+".jpg"
            urllib.request.urlretrieve(bLink,filename=imageName)
            n+=1
        zhang+=1

    elif choice=="D" and adjust=="Y":

        addLink="watermark,image_d2F0ZXIvYmFpa2UxODA=,g_7,xp_5,yp_5/format,f_auto"
        myfile=open("web.html","r",encoding="utf-8")
        data=myfile.read()
        myfile.close()
        obs=bs(data,"html.parser")
        result=obs.find_all("img")
        n=1
        
        for i in result:
            try:
        ##        print(i)
                myLink=re.findall(pattern,str(i))

                aLink=myLink[0]
                aList=aLink.split("/")
                aLink=aList[2][:-1]#需要依据实际情况不断调整。
        ##        print(aList)
                bLink=f"https://bkimg.cdn.bcebos.com/pic/{aLink}?x-bce-process=image/{addLink}"
        ####        bLink=aList[-1]#通过观察,找到更为清晰的图片链接。
                print(bLink)

                imageName="图"+str(n)+".jpg"
                urllib.request.urlretrieve(bLink,filename=imageName)
                n+=1
            except:pass
        zhang+=1


    elif choice=="W" and adjust=="Y":
        addLink=input("请根据情况输入图片网址的前半部分:")
        #"http://imgsrc.baidu.com/forum/pic/item/"
        for webUrl in webUrlList:
            html=requests.get(webUrl)
            html.encoding="utf-8"
            data=html.text
            obs=bs(data,"html.parser")
            obs.prettify()
            result=obs.find_all(attrs={"class":classImage})
            n=1
            for i in result:
                print(i)
                myLink=re.findall(pattern,str(i))#bs是用find_all,而re使用findall
                print(myLink)
                aLink=myLink[0]
                aList=aLink.split("/")
                bLink=addLink+aList[-1].rstrip('"')#通过观察,找到更为清晰的图片链接;去掉结尾多余的引号
                print(bLink)
                imageName=str(zhang)+"图"+str(n)+".jpg"        
                urllib.request.urlretrieve(bLink,filename=imageName)
                n+=1
            zhang+=1

    elif choice=="W" and adjust=="N":
        zhang=1
        for webUrl in webUrlList:
            html=requests.get(webUrl)
            html.encoding="utf-8"
            data=html.text
            obs=bs(data,"html.parser")
            obs.prettify()
            result=obs.find_all(attrs={"class":classImage})

            n=1
            for i in result:
                myLink=re.findall(pattern,str(i))
                bLink=str(myLink[0])[5:-1]#去掉匹配结果里的 src=" 前缀和结尾引号,得到纯链接
                print(bLink)
                imageName="图"+str(n)+".jpg"
                urllib.request.urlretrieve(bLink,filename=imageName)
                n+=1
            zhang+=1

    else:print("未能导出图片,请进一步完善程序。")


                


            


网站中的地理图片

国家地理网站中,有一些图片也可以进行爬取。


# inputfile='nationalgeograph.html'
# outputfile="nationalgeograph-urls.txt"

def inputFile():
    f=open("nationalgeographic.htm","r",encoding="utf-8")
    ls=f.readlines()
    f.close()
    print(ls)
    #<img alt="火山口" src="http://image.ngchina.com.cn/2019/1104/20191104100458321.jpg">

    urls = []
    for line in ls:
        if 'img' in line:
            url = line.split('src=')[-1].split('"')[1]  # 先按src=切开取最后一段,再按双引号切分,取出引号内的链接
            if 'http' in url:
                urls.append(url)
    #print(urls)
    return urls
#inputFile()


def showResults():
    urls=inputFile()
    f=open("result.txt","w",encoding="utf-8")
    count = 0
    for url in urls:
        print("第{}个URL:{}".format(count, url),file=f)
        print("第{}个URL:{}".format(count, url))
        count += 1
    f.close()

showResults()

注意,爬取时候要获得图片的网页链接
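
拿到图片链接之后,还要把图片真正下载到本地。下面是一个最小示意,沿用上面的 inputFile(),用 urllib.request.urlretrieve 逐个保存(文件名格式只是假设):

import urllib.request

def downloadImages():
    urls=inputFile()#沿用上面定义的inputFile(),从本地html里提取出图片链接
    count=0
    for url in urls:
        imageName="图"+str(count)+".jpg"#文件命名方式只是示意
        try:
            urllib.request.urlretrieve(url,filename=imageName)
            print("已保存:",imageName,url)
        except Exception as e:#个别链接可能失效或被防盗链拦截
            print("下载失败:",url,e)
        count+=1

#downloadImages()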



网站中的人物图片

在学习爬虫的时候,借鉴了网友的代码,如下所示,其中在运行过程中,发现会有bug需要修正。在一步一步运行代码并修正的时候,也对爬虫有了更深入的了解。

import requests

url="http://www.runoob.com"


header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
#设置headers,网站会根据这个判断你的浏览器及操作系统,很多网站没有此信息将拒绝你访问
#用get方法打开url并发送headers
html = requests.get(url,headers = header)
#print(html.text)


#提取所需要的信息
##将获取的源码转换为BeautifulSoup对象
##使用find搜索需要的数据,保存到容器中
from bs4 import BeautifulSoup

url='http://www.mzitu.com'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
html=requests.get(url,headers=header)
#print(html.text)

 
#使用自带的html.parser解析,速度慢但通用
soup = BeautifulSoup(html.text,'html.parser')


#寻找div中的所有a
all_a=soup.find("div",class_="postlist").find_all("a",target="_blank")
##for a in all_a:
##    title=a.get_text()#提取文本
##    print(title)
##
##
##all_div=soup.find("div",class_="postlist")
##for i in all_div:
##    tmp=i.get_text()
##    print(tmp)
#find 返回类型和find_all返回类型不同,find_all才能用get_text()

##page = soup.find_all('a', class_='page-numbers')
##max_page = page[-2].text
###print(max_page)

picture=soup.find("div",class_='postlist').find_all("a",target="_blank")
for everylink in picture:
    #print(everylink)
    
    tmp=everylink.attrs
    #print(tmp)
    
    mytxt=everylink.get_text()
    
    if "href" in everylink.attrs:
        print(f"href={everylink.attrs['href']}",sep="\t")
    
#print(picture)


# same_url = 'http://www.mzitu.com/page/'   # 主页默认最新图片
# 获取每一类MM的网址
##same_url = 'https://www.mzitu.com/mm/page/'
##
## 
##for n in range(1, int(max_page) + 1):
##    ul = same_url + str(n)
##    #print(ul)
##    # 分别对当前类每一页第一层url发起请求
##    start_html = requests.get(ul, headers=header)
##    # 提取所有MM的标题
##    soup = BeautifulSoup(start_html.text, "html.parser")
##    all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
##    #print(all_a)
##    
##    # 遍历所有MM的标题
##    for a in all_a:
##        # 提取标题文本,作为文件夹名称
##        title = a.get_text()
##        print(title)
##        if(title != ''):
##            print("准备扒取:" + title)
##            if(oa.path.exists(path+title.strip()))
##            
## 
## 


    

于是,经过试错,不断修正,完善为如下代码:
 

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
 
all_url = 'https://www.mzitu.com'
 
# http请求头
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
# 此请求头Referer破解盗图链接
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}
 
# 对mzitu主页all_url发起请求,将返回的HTML数据保存,便于解析
start_html = requests.get(all_url, headers=Hostreferer)
 
# Linux保存地址
# path = '/home/Nick/Desktop/mzitu/'
 
# Windows保存地址
path = 'E:/mzitu/'
 
# 获取最大页数
soup = BeautifulSoup(start_html.text, "html.parser")
page = soup.find_all('a', class_='page-numbers')
max_page = page[-2].text
 
 
# same_url = 'http://www.mzitu.com/page/'   # 主页默认最新图片
# 获取每一类MM的网址
same_url = 'https://www.mzitu.com/mm/page/'     # 也可以指定《qingchun MM系列》
 
for n in range(1, int(max_page) + 1):
    # 拼接当前类MM的所有url
    ul = same_url + str(n)
 
    # 分别对当前类每一页第一层url发起请求
    start_html = requests.get(ul, headers=Hostreferer)
 
    # 提取所有MM的标题
    soup = BeautifulSoup(start_html.text, "html.parser")
    all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
 
    # 遍历所有MM的标题
    for a in all_a:
        # 提取标题文本,作为文件夹名称
        title = a.get_text()
        if(title != ''):
            print("准备扒取:" + title)
 
            # windows不能创建带?的目录,添加判断逻辑
            if(os.path.exists(path + title.strip().replace('?', ''))):
                # print('目录已存在')
                flag = 1
            else:
                os.makedirs(path + title.strip().replace('?', ''))
                flag = 0
            # 切换到上一步创建的目录
            os.chdir(path + title.strip().replace('?', ''))
 
            # 提取第一层每一个MM的url,并发起请求
            href = a['href']
            html = requests.get(href, headers=Hostreferer)
            mess = BeautifulSoup(html.text, "html.parser")
 
            # 获取第二层最大页数
            pic_max = mess.find_all('span')
            pic_max = pic_max[9].text
            if(flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max)):
                print('已经保存完毕,跳过')
                continue
 
            # 遍历第二层每张图片的url
            for num in range(1, int(pic_max) + 1):
                # 拼接每张图片的url
                pic = href + '/' + str(num)
 
                # 发起请求
                html = requests.get(pic, headers=Hostreferer)
                mess = BeautifulSoup(html.text, "html.parser")
                pic_url = mess.find('img', alt=title)
                print(pic_url['src'])
                html = requests.get(pic_url['src'], headers=Picreferer)
 
                # 提取图片名字
                file_name = pic_url['src'].split(r'/')[-1]
 
                # 保存图片
                f = open(file_name, 'wb')
                f.write(html.content)
                f.close()
            print('完成')
    print('第', n, '页完成')

个人图书馆中的学习图片

在360doc中,有些图片很利于学习,如何爬取呢?

写如下代码:

import urllib.request
import requests
from bs4 import BeautifulSoup as bs
import re

 
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
    'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
 
    'limit': '20',
    'sort_by': 'created'
}

 
webUrl="http://www.360doc.com/showweb/0/0/1104723360.aspx"

webFile=requests.get(webUrl)
webFile.encoding="utf-8"
data=webFile.text
print(data)


运行后,返回效果如下

<html>
<head><title>403 Forbidden</title></head>
<body bgcolor="white">
<center><h1>403 Forbidden</h1></center>
<hr><center>nginx</center>
</body>
</html>

推测原因,是你需要登录账号,才可以查看。

403 Forbidden是HTTP协议中的一个状态码(Status Code),可以简单地理解为没有权限访问此站点:服务器理解了本次请求,但拒绝执行,该请求也不应原样重发给服务器。

既然这样,直接Ctrl + S,保存该网页所有信息。该网页图片也会全部保存下来,反而更有效率。
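
在手动保存之前,也可以先试着把前面定义好的 headers 传给 requests.get(上面的代码定义了 headers 却没有用上):有些站点只是拒绝没有浏览器标识的请求;若仍返回 403,再按上面的办法登录或手动保存。一个最小示意(其中 referer 是猜测性的补充,未必能绕过限制):

import requests

webUrl="http://www.360doc.com/showweb/0/0/1104723360.aspx"
headers={'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
         'referer':'http://www.360doc.com/'}#referer为猜测性补充

webFile=requests.get(webUrl,headers=headers)
print(webFile.status_code)#若仍是403,说明只靠请求头不够,需要登录或按上面的办法手动保存
if webFile.status_code==200:
    webFile.encoding="utf-8"
    print(webFile.text[:200])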

【心得】

爬取综合信息

网站中的邮箱号码

对于网易中的邮箱号码,也可以进行爬取。



        
def Gupiao():

    htmlfile=requests.get("http://quotes.money.163.com/trade/lsjysj_zhishu_000001.html?year=")
    htmlfile.encoding='utf-8'
    mysoup = BeautifulSoup(htmlfile.text,'html.parser')
    mycontent=mysoup.prettify()
    #print(type(mycontent))

    #输出字符串的前面信息,便于观察整个网站构成
    print(mycontent[:200])
    print()


    #寻找需要的信息,区分不同的语法
    def Find():        
        myinfor=mysoup.find_all("a")
        for i in myinfor:
            tmp=i.get_text()
            print(tmp)
            print(i)
            print()
            print(i.prettify())
        #print(myinfor)

            
    #将需要的网站输出
    def Wangzhan():
        urlsList=[]
        myinfor=mysoup.find_all("a")
        for line in myinfor:
            #print(line)
            
            tmp=str(line)#line的类型是<class 'bs4.element.Tag'>
            
            
            if "http" in tmp:
                url=tmp.split('"')#将长的字符串切分,留下网站
                urlsList.append(url[1])
                
                print(line.get_text())#获得网站的标题
                print(url[1])#输出网站字符串
            
    Wangzhan()
        
            
Gupiao()

     
def Ceyan():

    htmlfile=requests.get("http://quotes.money.163.com/trade/lsjysj_zhishu_000001.html?year=")
    htmlfile.encoding='utf-8'
    mysoup = BeautifulSoup(htmlfile.text,'html.parser')
    mycontent=mysoup.prettify()
    #print(type(mycontent))
    print(mycontent[:500])
    print()
    
    myinfor=mysoup.find_all("a")#<a href="http://www.163.com/">网易首页</a>,寻找的是属性,如a,如td,如tr,head,body,
    for i in myinfor:
        tmp=i.get_text()
        print(tmp)
        print(i)
        print()
        print(i.prettify())#用不同的语法格式,看看输出的效果如何。然后就知道各个语句的用法何在。prettify的作用是把密密麻麻的一行输出为整齐的几行,便于阅读。
    #print(myinfor)

Ceyan()

在实践中,也会遇到问题。可以调整代码解决:

from bs4 import BeautifulSoup
import time

htmlFile=open("stock1.html","r",encoding="utf-8")
htmlContent=htmlFile.read()

#time.sleep(10) #暂停10秒

myBS=BeautifulSoup(htmlContent,"html.parser")
#print(myBS)
myLinks=myBS.find_all("a")
#print(myLinks)

for everyLink in myLinks:
	myText=everyLink.get_text()
	#print(myText)

	if "163.com" not in myText:
		print("test")
		print(myText)
		if "href" in everyLink.attrs:#属性attrs
			print(f"{myText}:href={everyLink.attrs['href']}",sep="\t")
			#print(myText,":href=",everyLink.attrs['href'],sep="\t")


'''
myBs=BeautifulSoup(htmlcontent,''html.parser'')
mylinks=myBs.find_all('a')

for everylink in mylinks:
	mytext=everylink.get_text()
	if '163.com' not in mytext:
		if"href" in everylink.attrs:
			print(mylink)

问题:为什么163.com还会出现呢?
运行结果:
网易首页:href=http://www.163.com/
新闻:href=http://news.163.com/
体育:href=http://sports.163.com/
NBA:href=http://sports.163.com/nba/
娱乐:href=http://ent.163.com/
财经:href=http://money.163.com/
股票:href=http://money.163.com/stock/
汽车:href=http://auto.163.com/
科技:href=http://tech.163.com/
'''

163.com 之所以仍然出现,是因为判断针对的是 get_text() 取到的链接文字(如“网易首页”),而 163.com 只出现在 href 属性里,所以条件 "163.com" not in myText 几乎总是成立;若要过滤,应改为判断 everyLink.attrs['href']。爬取之后,再把结果有序输出。
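
若要“有序输出”,可以先把(链接文字, href)收集到列表里,排序后再统一打印。一个最小示意(沿用上面代码里的 myLinks):

#最小示意:收集后排序输出(myLinks 来自上面 find_all("a") 的结果)
pairs=[]
for everyLink in myLinks:
    myText=everyLink.get_text()
    if "href" in everyLink.attrs:
        pairs.append((myText,everyLink.attrs['href']))

pairs.sort()#默认按链接文字排序,也可用 key=lambda x:x[1] 改为按链接排序
for text,href in pairs:
    print(f"{text}:href={href}")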

网站中的大学信息

读取网络文本


import requests
webFile=requests.get("http://www.pku.edu.cn")
webFile.encoding="utf-8"
webFile=webFile.text
print(webFile)

解析网页

import requests
response=requests.get('https://www.pku.edu.cn')
mycode=response.status_code
mycontent=response.content

分析所爬内容


with open(r"E:\pkuCode.txt","r",encoding="utf-8") as myFile:
    data=myFile.readlines()

myList=list(data)
for i in myList:
    print(i)
    input()

解析对象

import requests
from bs4 import BeautifulSoup as bs


webFile=requests.get("https://www.pku.edu.cn")#爬虫获得html文件
webFile.encoding="utf-8"#爬虫解析网页文件
data=webFile.text#用text文档形式展现,解析为字符串

soup=bs(data,"html.parser")# 把网页解析为BeautifulSoup对象
soup.prettify()

items2=soup.find_all(class_="item")


myFile=open(r"E:\mySchoolLink.txt","w",encoding="utf-8")

for everyTag in items2:
    #print(everyTag)

    print(file=myFile)
    print("文字部分",file=myFile)
    myText=everyTag.get_text()
    print(myText,file=myFile)

    print(file=myFile)
    print("链接部分",file=myFile)
    myLinks=everyTag.find_all("a")#everyLink是BS 中的tag
    for everyLink in myLinks:
        if "href" in everyLink.attrs:#attrs只有在BS 中tag中才可以用。
            print(everyLink.attrs,file=myFile)

myFile.close()    

可以用requests ,将对象存储下来:

import requests
file1=requests.get("https://www.pku.edu.cn")

file1.encoding="utf-8"
data=file1.text

myFile=open(r"E:\pkuCode.txt","w",encoding="utf-8")
print(data,file=myFile)

myFile.close()


'''
soup的数据类型是<class 'bs4.BeautifulSoup'>,说明soup是一个BeautifulSoup对象
打印的soup,是所请求网页的完整HTML源代码
虽然response.text和soup打印出的内容表面上看长得一模一样,却有着不同的内心,它们属于不同的类:<class 'str'> 与<class 'bs4.BeautifulSoup'>。前者是字符串,后者是已经被解析过的BeautifulSoup对象。之所以打印出来的是一样的文本,是因为BeautifulSoup对象在直接打印它的时候会调用该对象内的str方法,所以直接打印 bs 对象显示字符串是str的返回结果
'''
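
上面这段说明可以用几行代码直观验证:response.text 是字符串,soup 是 BeautifulSoup 对象,直接打印时看起来一样:

import requests
from bs4 import BeautifulSoup

response=requests.get("https://www.pku.edu.cn")
response.encoding="utf-8"
soup=BeautifulSoup(response.text,"html.parser")

print(type(response.text))#<class 'str'>
print(type(soup))#<class 'bs4.BeautifulSoup'>
print(response.text[:60])
print(str(soup)[:60])#直接打印soup时输出的就是它的字符串形式,所以两者看起来一样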

网站中的音乐

import requests
from bs4 import BeautifulSoup as bs

headers = {
    'origin':'https://y.qq.com',
    # 请求来源,本案例中其实是不需要加这个参数的,只是为了演示
    'referer':'https://y.qq.com/n/yqq/song/004Z8Ihr0JIu5s.html',
    # 请求来源,携带的信息比“origin”更丰富,本案例中其实是不需要加这个参数的,只是为了演示
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    # 标记了请求从什么设备,什么浏览器上发出
    }
# 伪装请求头


url1='''
https://c.y.qq.com/soso/fcgi-bin/client_search_cp?
ct=24&qqmusic_ver=1298&
new_json=1&remoteplace=txt.yqq.song&
searchid=57068364391640558&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&
'''
url2="p=1"

url3="""
&n=10&w=%E5%91%A8%E6%9D%B0%E4%BC%A6&g_tk_new_20200303=5381&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0
"""
#注意,这个网址是在网页检查过程中找到并复制的,针对文本所在内容的网址,而不是qq音乐的官网。详情参看风变编程笔记。

for i in range(1,10):
    url=url1+"p="+str(i)+url3
    webFile=requests.get(url)
    webFile.encoding="utf-8"
    data=webFile.text

    jsonMusic=webFile.json()
    listMusic = jsonMusic['data']['song']['list']
    
    for i in listMusic:
        print("专辑名:",i["albumname"])
        print("歌曲名:",i["songname"])
        print('播放时长:'+str(i['interval'])+'秒')
        print('播放链接:https://y.qq.com/n/yqq/song/'+i['media_mid']+'.html\n\n')
  

import requests
from bs4 import BeautifulSoup as bs

myHref="https://y.qq.com/n/yqq/singer/0025NhlN2yWrP4.html"
webFile=requests.get(myHref)
data=webFile.text

soup=bs(data,"html.parser")




print("""class_=js_song""")
items1=soup.find_all(class_="js_song")
count=0
for everyLink in items1:
    myText=everyLink.get_text()
    print("everyLink : ","\n",everyLink)
    
    print("myText:","\n",myText)
    print("everyLink.attrs:","\n",everyLink.attrs)
    print(everyLink.attrs["href"])

    count+=1
    if count==1:
        break

print()

print("""class_=songlist__songname_txt""")
items2=soup.find_all(class_="songlist__songname_txt")

count=0
for everyLink in items2:
    myText=everyLink.get_text()
    print("everyLink : ","\n",everyLink)
    
    print("myText:","\n",myText)
    print("everyLink.attrs:","\n",everyLink.attrs)
    print(everyLink.attrs["class"])
    count+=1
    if count==1:
        break
'''    
    if "href" in everyLink.attrs:#属性attrs
            print(f"{myText}:href={everyLink.attrs['href']}",sep="\t")
            print(myText,":href=",everyLink.attrs['href'],sep="\t")


    注意,bs提取的信息,class很关键。筛选的东西,之后会形成一个字典。
    如果筛选的范围是链接范围,everyLink.attrs["href"]就会出现链接。
    如果筛选的范围是文本标签,就只能写成everyLink.attrs["class"]
'''
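
上面这段注释可以用一个纯演示的小例子直观验证(标签内容是随手编的):只有链接标签的 attrs 里才有 href,文本类标签的 attrs 里通常只有 class:

from bs4 import BeautifulSoup as bs

demo=bs('<a class="js_song" href="/n/yqq/song/demo.html">某首歌</a>'
        '<span class="songlist__songname_txt">某首歌</span>',"html.parser")

for tag in demo.find_all():
    print(tag.name,tag.attrs)#a标签的attrs里有href,span标签的attrs里只有class
    if "href" in tag.attrs:
        print("链接:",tag.attrs["href"])
    else:
        print("类别:",tag.attrs["class"])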


import requests
from bs4 import BeautifulSoup as bs

webURL="""
https://c.y.qq.com/soso/fcgi-bin/client_search_cp?
ct=24&qqmusic_ver=1298&
new_json=1&remoteplace=txt.yqq.song&
searchid=57068364391640558&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&
p=1
&n=10&w=%E5%91%A8%E6%9D%B0%E4%BC%A6&g_tk_new_20200303=5381&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0
"""

webFile=requests.get(webURL)
webFile.encoding="utf-8"
data=webFile.text

jsonFile=webFile.json()
##print(type(jsonFile))#<class 'dict'>使用json()方法,将对象转为列表/字典
##for (k,v) in jsonFile.items():
##    print(k)

musicData=jsonFile["data"]#注意,文中是引号-字符串,那么得用引号,如果写成jsonFile[data]是没有用的
##print(type(musicData))
##for (k,v) in musicData.items():
##    print(k)


listMusic=musicData["song"]["list"]
print(type(listMusic))
for music in listMusic:
    print("播放专辑:",music["album"]["name"])
    print('播放时长:'+str(music['interval'])+'秒')  # 查找播放时长
    print('播放链接:https://y.qq.com/n/yqq/song/' +music['mid']+'.html\n\n')
    input()
##
##soup=bs(data,"html.parser")
##print(type(soup))#<class 'bs4.BeautifulSoup'>

import requests
from bs4 import BeautifulSoup as bs
import openpyxl

workBook=openpyxl.Workbook()
sheet1=workBook.active
sheet1.title="qq音乐链接表"



url="https://y.qq.com/n/yqq/singer/000FzZ3q3kxTMG.html"

webFile=requests.get(url)
webFile.encoding="utf-8"
data=webFile.text

soup=bs(data,"html.parser")


sheet1.append(["footer_platform_list__item"])
Tag1=soup.find_all(class_="footer_platform_list__item")
for everyTag in Tag1:
    myText=everyTag.get_text()


    myLinks=everyTag.find_all("a")
    for i in myLinks:
        if "href" in i.attrs:
            myList1=[myText,i['href']]

            print(myList1)
            
            sheet1.append(myList1)


sheet1.append(["footer_link"])
Tag2=soup.find_all(class_="footer_link")
for everyTag in Tag2:
    myText=everyTag.get_text()
    myLinks=everyTag.find_all("a")
    for i in myLinks:
        if "href" in i.attrs:
            myList2=[myText,i["href"]]
            print(myList2)
            sheet1.append(myList2)
            
workBook.save("积累文档-QQ音乐网络链接.xlsx")



import requests
from bs4 import BeautifulSoup as bs
import openpyxl

workBook=openpyxl.Workbook()
sheet1=workBook.active
sheet1.title="qq音乐链接表"



url="https://y.qq.com/n/yqq/singer/000FzZ3q3kxTMG.html"

webFile=requests.get(url)
webFile.encoding="utf-8"
data=webFile.text

soup=bs(data,"html.parser")


myClass=['footer_platform_list__item','footer_link','footer_download','footer_copyright','footer_platform']#同一个class写一次即可,避免重复导出

for everyClass in myClass:
    print(everyClass)
    
    sheet1.append([everyClass])
    Tag1=soup.find_all(class_=everyClass)

    for everyTag in Tag1:
        myText=everyTag.get_text()
        myLinks=everyTag.find_all("a")

        for i in myLinks:
            if "href" in i.attrs:
                myList1=[myText,i["href"]]
                print(myList1)

                sheet1.append(myList1)

workBook.save("积累文档-QQ音乐链接简练版.xlsx")


        

网站中的题目


#爬取网站上的题目
from bs4 import BeautifulSoup
import time
import requests


def Pachong():
    for pageNum in range(1,17):
        htmlFile=requests.get('http://vers.cqvip.com/view/course/subject/list.aspx?stid=ad29646905394d96a50c1818329fb4f6&cid=120&searchkey='+str(pageNum))
        htmlFile.encoding='utf-8'

        
        soup = BeautifulSoup(htmlFile.text,'html.parser')
        print(soup)
        input()
    #Pachong()


    htmlFile=requests.get('http://vers.cqvip.com/view/course/subject/list.aspx?stid=ad29646905394d96a50c1818329fb4f6&cid=120&searchkey=2')
    htmlFile.encoding='utf-8'
    print(htmlFile)


def PaTi():
    htmlfile=requests.get("http://vers.cqvip.com/view/course/chapter/detail.aspx?cid=125&chapter=%E6%98%8E%E4%BB%A3%E6%96%87%E5%AD%A6")
    htmlfile.encoding='utf-8'
    mysoup=BeautifulSoup(htmlfile.text,'html.parser')

    mycontent1=mysoup.prettify()
    print(mycontent1[:100])

    
    mycontent2=mysoup.smooth()#注意:smooth()是在原地合并相邻的文本节点,返回值是None
    print(mycontent2)
    print("OK")

    mycontent3=mysoup.select_one("div")
    print(mycontent3)
    print("Next")
    print()

    
    myinfor=mysoup.find("div").find_all("strong")
    print(myinfor)
    tmp=mysoup.find_next_sibling("div")
    print(tmp)
    
    
#class="q-box"
PaTi()

        
def Gupiao():

    htmlfile=requests.get("http://quotes.money.163.com/trade/lsjysj_zhishu_000001.html?year=")
    htmlfile.encoding='utf-8'
    mysoup = BeautifulSoup(htmlfile.text,'html.parser')
    mycontent=mysoup.prettify()
    #print(type(mycontent))

    #输出字符串的前面信息,便于观察整个网站构成
    print(mycontent[:200])
    print()


    #寻找需要的信息,区分不同的语法
    def Find():        
        myinfor=mysoup.find_all("a")
        for i in myinfor:
            tmp=i.get_text()
            print(tmp)
            print(i)
            print()
            print(i.prettify())
        #print(myinfor)

            
    #将需要的网站输出
    def Wangzhan():
        urlsList=[]
        myinfor=mysoup.find_all("a")
        for line in myinfor:
            #print(line)
            
            tmp=str(line)#line的类型是<class 'bs4.element.Tag'>
            
            
            if "http" in tmp:
                url=tmp.split('"')#将长的字符串切分,留下网站
                urlsList.append(url[1])
                
                print(line.get_text())#获得网站的标题
                print(url[1])#输出网站字符串
            
    Wangzhan()
        
            
#Gupiao()






网站中的博客内容

知乎中的文章

import requests
import csv
#引用csv。
csv_file=open('articles.csv','w',newline='',encoding='utf-8')
#调用open()函数打开csv文件,传入参数:文件名“articles.csv”、写入模式“w”、newline=''。
writer = csv.writer(csv_file)
# 用csv.writer()函数创建一个writer对象。
list2=['标题','链接','摘要']
#创建一个列表
writer.writerow(list2)
#调用writer对象的writerow()方法,可以在csv文件里写入一行文字 “标题”和“链接”和"摘要"。

headers={'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
url='https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles?'
offset=0
#设置offset的起始值为0
while True:
    params={
        'include':'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
        'offset':str(offset),
        'limit':'20',
        'sort_by':'voteups',
        }
    #封装参数
    res=requests.get(url,headers=headers,params=params)
    #发送请求,并把响应内容赋值到变量res里面
    articles=res.json()
    print(articles)
    data=articles['data']
    #定位数据
    for i in data:
        list1=[i['title'],i['url'],i['excerpt']]
        #把目标数据封装成一个列表
        writer.writerow(list1)
        #调用writerow()方法,把列表list1的内容写入
    offset=offset+20
    #在while循环内部,offset的值每次增加20
    if offset > 40:
        break
csv_file.close()
#写入完成后,关闭文件就大功告成
print('okay')   

import requests
from bs4 import BeautifulSoup as bs

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
    'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
    'limit': '20',
    'sort_by': 'created'
}

url2="https://www.zhihu.com/org/jing-ji-ri-bao-xin-wen-ke-hu-duan/posts"
webFile=requests.get(url2,params=params,headers=headers)
webFile.encoding="utf-8"
data=webFile.text

soup=bs(data,"html.parser")
preData=soup.prettify()



items2=soup.find_all(class_="item")

for iTag in items2:
    for i in iTag.find_all():
        print(i)

爬取博客

from urllib3 import *
from re import *
http=PoolManager()

#禁止显示警告信息
disable_warnings()

#下载url对应web页面
url="https://www.cnblogs.com/"
result=http.request("GET",url)
htmlStr=result.data.decode("utf-8")
print(htmlStr)


#分析html代码
#通过正则表达式,获取所有关于目标的信息
#<a class="post-item-title" href="https://www.cnblogs.com/hzoi-fengwu/p/14922218.html" target="_blank">STL----vector注意事项</a>

aList=findall(r'<a[^>]*post-item-title[^>]*>[^<]*</a>',htmlStr)
result=[]

#提取每一个<a后面的url

for a in aList:
    #利用正则表达式提取href后面的url
    g=search(r'href\s*=\s*"([^"]*)"',a)#用分组捕获引号内的链接
    if g!=None:
        url=g.group(1)

        #得到url
        print(url)

爬取博客标题-爬虫-正则表达式部分
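
同样的需求也可以不用正则,改用前文一直在用的 BeautifulSoup 按 class 提取。下面是一个最小示意,class 名 post-item-title 来自上面注释里的示例标签:

import requests
from bs4 import BeautifulSoup as bs

webFile=requests.get("https://www.cnblogs.com/")
webFile.encoding="utf-8"
soup=bs(webFile.text,"html.parser")

for a in soup.find_all("a",class_="post-item-title"):
    print(a.get_text(),a.attrs.get("href"))#标题与对应链接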

网站中的词典

#网络爬虫进阶urllib.request
def ilovefish():
    import urllib.request
    myResponse=urllib.request.urlopen("https://ilovefishc.com/")#打开网页,获取信息

    myHtml=myResponse.read()#读出数据
    #print(myHtml)
    myHtml=myHtml.decode("utf-8")#将二进制解码,按照网页信息<head> <meta charset="UTF-8">选择解码格式utf-8
    #print(myHtml)


def placekitten():
    #placekitten.com
    import urllib.request
    myResponse=urllib.request.urlopen("http://placekitten.com/500/600")#打开网页,获取信息

    my_cat_img=myResponse.read()#读出数据
    with open('cat_500_600.jpg','wb') as f:
        f.write(my_cat_img)

def myrequest():
    #urllib.request():This function always returns an object which can work as a context manager and has methods such as
    #geturl() — return the URL of the resource retrieved, commonly used to determine if a redirect was followed
    #info() — return the meta-information of the page, such as headers, in the form of an email.message_from_string() instance (see Quick Reference to HTTP Headers)
    #getcode() – return the HTTP status code of the response.


    import urllib.request
    myresponse=urllib.request.urlopen("http://placekitten.com/300/500")
    myurl=myresponse.geturl()
    print(myurl)

    print(myresponse.info())
    print(myresponse.getcode())

def Cidan():
    #小甲鱼将有道辞典功能提取出的程序,这里还只是个雏形
    import urllib.request
    url='http://fanyi.youdao.com/'
    data={}#待填写的表单字段,需在浏览器开发者工具中抓包确认

    #注意:urlopen的data参数必须是字节串,直接传入字典会报错;提交表单的完整写法见下面的示例
    my_response=urllib.request.urlopen(url)
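
用 urllib.request 向接口提交表单的一般写法大致如下。这只是一个示意:接口地址和表单字段名('i'、'doctype')都是假设,需要按浏览器开发者工具里抓到的真实请求来替换:

import urllib.request
import urllib.parse

def fanyi_demo(word):
    #仅为示意:url和表单字段均为假设值,以实际抓包结果为准
    url='http://fanyi.youdao.com/translate'
    data={'i':word,'doctype':'json'}

    postData=urllib.parse.urlencode(data).encode('utf-8')#urlopen的data参数必须是字节串
    response=urllib.request.urlopen(url,postData)
    result=response.read().decode('utf-8')
    print(result[:200])#若返回的是json,可再用json模块解析

#fanyi_demo("fish")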

          

【心得】
