Python遇见数据采集

最新推荐文章于 2022-11-03 09:48:39 发布

那些很冒险的梦

最新推荐文章于 2022-11-03 09:48:39 发布

阅读量532

点赞数

分类专栏：Python学习　文章标签：python url

本文链接：https://blog.csdn.net/JluTiger316/article/details/78421675

版权

Python学习专栏收录该内容

4 篇文章 0 订阅

订阅专栏

urllib的用法

urllib是Python3.x中提供的一系列操作URL的库，它可以轻松地模拟用户使用浏览器访问网页。

这里写图片描述

结果输出：

这里写图片描述

打开某网页，按 F12 打开开发者工具，在 Doc 中选中请求，即可在请求头里找到 User-Agent（key）及其对应的 value。
有的网站会根据请求有没有携带 User-Agent 来判断当前访问是不是爬虫。

from urllib import request

# Build the request object first so that headers can be attached before it is sent.
req = request.Request("http://www.baidu.com/")
# Some sites treat a request that carries no User-Agent as a crawler, so send
# the value a real browser would send.
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
# Send the request exactly once; the context manager closes the connection
# after the body has been read.
with request.urlopen(req) as resp:
    print(resp.read().decode("utf8"))

使用urllib发送post请求

这里写图片描述

from urllib.request import urlopen  # sends the request and returns the response
from urllib.request import Request  # request object that headers can be attached to
from urllib import parse  # provides urlencode() for the form body

req = Request("http://www.thsrc.com.tw/tw/TimeTable/SearchResult")

# Form fields, taken from the "Form Data" section shown for this request in the
# browser's developer tools.
post_data = parse.urlencode([
    ("StartStation", "977abb69-413a-4ccf-a109-0272c24fd490"),
    ("EndStation", "9c5ac6ca-ec89-48f8-aab0-41b738cb1814"),
    ("SearchDate", "2017/11/02"),
    ("SearchTime", "10:00"),
    ("SearchWay", "DepartureInMandarin"),
])
# Without these two headers the site may refuse the request.
req.add_header("Origin", "http://www.thsrc.com.tw")
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:48.0) Gecko/20100101 Firefox/48.0")
# Supplying data= turns the request into a POST; the body has to be bytes.
# The context manager closes the connection once the body has been read.
with urlopen(req, data=post_data.encode("utf-8")) as resp:
    print(resp.read().decode("utf-8"))

结果输出：

这里写图片描述

Beautiful Soup

Beautiful Soup是一个可以从HTML或XML文件中提取数据的Python库.它能够通过你喜欢的转换器实现惯用的文档导航,查找,修改文档的方式.Beautiful Soup会帮你节省数小时甚至数天的工作时间.
中文文档

# Author:JluTiger
from bs4 import BeautifulSoup as bs

# Sample document used by the Beautiful Soup examples.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# Name the parser explicitly; leaving it out may produce a warning.
soup = bs(html_doc, features="html.parser")
pretty_html = soup.prettify()
print(pretty_html)

运行结果：

这里写图片描述

几个简单的浏览结构化数据的方法:

# Each expression below is followed by a comment showing the value it
# evaluates to for the sample html_doc.

soup.title
# <title>The Dormouse's story</title>

soup.title.name
# 'title'

soup.title.string
# "The Dormouse's story"

soup.title.parent.name
# 'head'

soup.p
# <p class="title"><b>The Dormouse's story</b></p>

soup.p['class']
# ['title']  (class is a multi-valued attribute, so Beautiful Soup returns a list)

soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

获取维基百科词条信息：

# Import the required packages
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Fetch the page and decode the body as UTF-8; the context manager closes the
# connection once the body has been read.
with urlopen("https://en.wikipedia.org/wiki/Wiki") as page:
    resp = page.read().decode("utf-8")
# Parse the HTML with Beautiful Soup
soup = BeautifulSoup(resp, "html.parser")
# Collect every <a> tag whose href starts with /wiki/
listUrls = soup.find_all("a", href=re.compile("^/wiki/"))
# Raw string so that "\." reaches the regex engine as an escaped dot rather
# than being an invalid string escape; compiled once, outside the loop.
image_suffix = re.compile(r"\.(jpg|JPG)$")
# Print the name and absolute URL of every entry
for url in listUrls:
    # Skip links that end in .jpg or .JPG
    if image_suffix.search(url["href"]):
        continue
    # .string only yields a single string; get_text() returns all of the text
    # found under the tag.
    print(url.get_text(), '<--->', "https://en.wikipedia.org" + url["href"])

输出结果：

这里写图片描述

存储数据到MySQL

# Import the required packages
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql.cursors

# Fetch the page and decode the body as UTF-8; the context manager closes the
# connection once the body has been read.
with urlopen("https://en.wikipedia.org/wiki/Wiki") as page:
    resp = page.read().decode("utf-8")
# Parse the HTML with Beautiful Soup
soup = BeautifulSoup(resp, "html.parser")
# Collect every <a> tag whose href starts with /wiki/
listUrls = soup.find_all("a", href=re.compile("^/wiki/"))
# Raw string so that "\." reaches the regex engine as an escaped dot rather
# than being an invalid string escape.
image_suffix = re.compile(r"\.(jpg|JPG)$")

# One (name, absolute URL) pair per entry, skipping links that end in .jpg or
# .JPG. get_text() is used because .string only yields a single string, while
# get_text() returns all of the text found under the tag.
rows = [
    (url.get_text(), "https://en.wikipedia.org" + url["href"])
    for url in listUrls
    if not image_suffix.search(url["href"])
]
for name, href in rows:
    print(name, '<--->', href)

# Open ONE database connection for the whole batch. Connecting inside the loop
# and re-inserting the full link list on every iteration would store each link
# many times over, including the image links that were meant to be skipped.
connection = pymysql.connect(host='127.0.0.1',
                             port=3306,
                             user='root',
                             password='*******',
                             db='imooc',
                             charset='utf8')
try:
    # Obtain a cursor for this session
    with connection.cursor() as cursor:
        # Parameterised statement: the values are passed separately from the SQL
        sql = "insert into `wikiurl`(`urlname`,`urlhref`)values(%s,%s)"
        for name, href in rows:
            cursor.execute(sql, (name, href))
    # Commit once, after every row has been inserted
    connection.commit()
finally:
    # Always release the connection, even if an insert failed
    connection.close()

数据库结果：

这里写图片描述

读取MySQL数据的方法

常见文件的读取

txt：

from urllib.request import urlopen

# Download the text file and print it decoded as UTF-8; the context manager
# closes the connection once the body has been read.
with urlopen("http://en.wikipedia.org/robots.txt") as html:
    print(html.read().decode("utf-8"))

这里写图片描述

pdf：

首先要安装 pdfminer3k（下载地址），也可以直接运行 pip install pdfminer3k 进行安装。

from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# Print the text of every page of 1.pdf. All parsing happens inside the
# with-block so the file stays open while pages are read and is closed
# afterwards, even if parsing raises.
with open("1.pdf", "rb") as fp:  # open the PDF in binary mode
    parser = PDFParser(fp)  # parser that reads from the file object
    doc = PDFDocument()  # document object
    # Two-step link between the parser and the document
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()  # initialise the document now that it has a parser

    resource = PDFResourceManager()  # PDF resource manager
    laparams = LAParams()  # layout parameters
    # The aggregator combines the resource manager with the layout parameters
    device = PDFPageAggregator(resource, laparams=laparams)
    # The page interpreter combines the resource manager with the aggregator
    interpreter = PDFPageInterpreter(resource, device)

    for page in doc.get_pages():  # every page of the document
        interpreter.process_page(page)  # interpret the page
        layout = device.get_result()  # fetch the parsed result from the aggregator
        for out in layout:  # walk the objects found on the page
            # Only objects that expose get_text() carry text
            if hasattr(out, "get_text"):
                print(out.get_text())