1. Crawler Introduction and Common Tools
- First crawler
from urllib.request import urlopen

url = 'http://www.baidu.com'
response = urlopen(url)
print(response.read().decode())
print(response.getcode())  # HTTP status code
print(response.geturl())   # the URL actually visited
print(response.info())     # HTTP response headers
- GET request
from urllib.request import urlopen, Request
from urllib.parse import quote, urlencode

args = {
    'wd': "尚学堂",
    'ie': 'utf-8'
}
print(urlencode(args))  # wd=%E5%B0%9A%E5%AD%A6%E5%A0%82&ie=utf-8
# url = 'https://www.baidu.com/s?ie=UTF-8&wd={}'.format(quote("尚学堂"))
url = 'https://www.baidu.com/s?{}'.format(urlencode(args))  # args already contains wd and ie
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
}
req = Request(url, headers=headers)
print(urlopen(req).read().decode())
- POST request
from urllib.request import urlopen, Request
from urllib.parse import urlencode

url = 'https://www.baidu.com/'
args = {
    'user': '111111',
    'password': '123456'
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
}
# passing data makes this a POST; the form data must be encoded to bytes
req = Request(url, headers=headers, data=urlencode(args).encode())
print(urlopen(req).read().decode())
- HTTPS requests (certificate problems)
import ssl
from urllib.request import urlopen

context = ssl._create_unverified_context()  # skip SSL certificate verification
# url: any https address, e.g. the one built in the examples above
print(urlopen(url, context=context).read().decode())
2. Common Crawler Development Modules
- Dynamic User-Agent
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.ie)
print(ua.chrome)
print(ua.random)
- Using an opener
from urllib.request import Request, build_opener, HTTPHandler
from fake_useragent import UserAgent

url = "http://httpbin.org/get"
headers = {
    "User-Agent": UserAgent().random
}
req = Request(url, headers=headers)
handler = HTTPHandler(debuglevel=1)  # print debug information for every request
opener = build_opener(handler)
resp = opener.open(req)
# print(resp.read().decode())
- Using a proxy
from urllib.request import build_opener, ProxyHandler

# handler = ProxyHandler({"http": "name:password@ip:port"})  # proxy with authentication
handler = ProxyHandler({"http": "211.137.52.158:8080"})
opener = build_opener(handler)
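The opener built above still needs a request sent through it; a minimal usage sketch, assuming the httpbin echo service and the same placeholder proxy address (swap in a working proxy):

from urllib.request import Request, build_opener, ProxyHandler
from fake_useragent import UserAgent

handler = ProxyHandler({"http": "211.137.52.158:8080"})  # placeholder proxy
opener = build_opener(handler)
req = Request("http://httpbin.org/get", headers={"User-Agent": UserAgent().random})
resp = opener.open(req)       # the request is routed through the proxy
print(resp.read().decode())   # httpbin echoes the origin IP, which should be the proxy's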
- Using cookies
from urllib.request import build_opener, HTTPCookieProcessor

handler = HTTPCookieProcessor()  # keeps cookies between requests made with this opener
opener = build_opener(handler)
- Saving and reusing cookies
# Saving cookies
from http.cookiejar import MozillaCookieJar
from urllib.request import build_opener, HTTPCookieProcessor

cookie_jar = MozillaCookieJar()
handler = HTTPCookieProcessor(cookie_jar)
opener = build_opener(handler)
resp = opener.open(req)  # req: a Request built as in the opener example above
cookie_jar.save('cookie.txt', ignore_discard=True, ignore_expires=True)

# Reusing cookies
from http.cookiejar import MozillaCookieJar

cookie_jar = MozillaCookieJar()
cookie_jar.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = HTTPCookieProcessor(cookie_jar)
opener = build_opener(handler)
resp = opener.open(req)
- Catching URLError
from urllib.error import URLError
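A minimal sketch of how the exception is typically handled, wrapping a urllib request like the ones above (the URL and timeout are only illustrative):

from urllib.request import urlopen, Request
from urllib.error import URLError
from fake_useragent import UserAgent

try:
    req = Request("http://httpbin.org/get", headers={"User-Agent": UserAgent().random})
    resp = urlopen(req, timeout=5)
    print(resp.getcode())
except URLError as e:
    # covers DNS failures, refused connections, timeouts wrapped by urllib, etc.
    print("request failed:", e.reason)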
- Using requests
- GET request
import requests
from fake_useragent import UserAgent

url = "http://httpbin.org/get"
proxy = {
    "http": "http://211.137.52.158:8080"  # route the request through a proxy
}
headers = {"User-Agent": UserAgent().random}
resp = requests.get(url, headers=headers, proxies=proxy)
print(resp.url)
resp.encoding = 'utf-8'
print(resp.text)
- Session keeps cookies automatically
import requests

s = requests.Session()
# a GET sent through the Session object; the response sets a cookie that the session stores
s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')
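To confirm the session really re-sends the stored cookie, a quick check against httpbin's /cookies endpoint (a sketch building on the request above):

import requests

s = requests.Session()
s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')
# the next request through the same session sends the stored cookie automatically
resp = s.get('http://httpbin.org/cookies')
print(resp.text)                       # {"cookies": {"sessioncookie": "123456789"}}
print(s.cookies.get('sessioncookie'))  # 123456789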
- SSL verification
import requests

# suppress the InsecureRequestWarning raised when certificate verification is disabled
requests.packages.urllib3.disable_warnings()
resp = requests.get(url, verify=False, headers=headers)
3. Data Extraction and Captcha Recognition
- Regular expressions: https://blog.csdn.net/mingzme/article/details/107250157
import re

f1 = re.match(r"\w", str)                   # match at the start of the string
s1 = re.sub(r"every_day", "EveryDay", str)  # substitution / replace
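Beyond match and sub, a short sketch of the other functions that come up constantly when extracting data (the sample text is made up):

import re

text = 'price: 42, discount price: 36'
m = re.search(r'\d+', text)              # first match anywhere in the string
print(m.group())                         # '42'
print(re.findall(r'\d+', text))          # every match as a list: ['42', '36']
pattern = re.compile(r'price:\s*(\d+)')  # precompile when a pattern is reused
print(pattern.findall(text))             # ['42', '36']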
- Data extraction with Beautiful Soup: https://blog.csdn.net/mingzme/article/details/107250908
from bs4 import BeautifulSoup

soup = BeautifulSoup(str, 'lxml')   # str: an HTML string
print(soup.title)
a = soup.select('CSS selector')[0]  # select() takes a CSS selector
print(a.text)                       # text content of the element
print(a.get('href'))                # an attribute of the element
- Data extraction with XPath: https://blog.csdn.net/mingzme/article/details/107252400
import requests
from lxml import etree
from fake_useragent import UserAgent

url = 'https://www.qidian.com/rank/fengyun?style=1&year=2018&month=08'
headers = {"User-Agent": UserAgent().chrome}
resp = requests.get(url, headers=headers)
e = etree.HTML(resp.text)
names = e.xpath('//div[@class="book-mid-info"]/h4/a/text()')
authors = e.xpath('//p[@class="author"]/a[1]/text()')
for name, author in zip(names, authors):
    print(name + ":" + author)
- Data extraction with PyQuery: https://blog.csdn.net/mingzme/article/details/107255479
from pyquery import PyQuery

doc = PyQuery(resp.text)
names = [a.text for a in doc('h4 a')]  # doc() takes a CSS selector
print(names)
- Data extraction with jsonpath: https://blog.csdn.net/mingzme/article/details/107299928
- json
import json

str = '{"name":"盗梦空间"}'
obj = json.loads(str)                          # string -> dict
obj_str = json.dumps(obj, ensure_ascii=False)  # dict -> string
# save an object to a file
json.dump(obj, open('movie.txt', 'w', encoding='utf-8'), ensure_ascii=False)
obj2 = json.load(open('movie.txt', encoding='utf-8'))  # file -> object
- Using jsonpath
import json
from jsonpath import jsonpath

names = jsonpath(json.loads(resp.text), '$..name')
ids = jsonpath(resp.json(), '$..id')
- Tesseract text recognition (Tesseract must be installed)
import pytesseract
from PIL import Image

img = Image.open('yzm1.jpg')
str = pytesseract.image_to_string(img)
print(str)
- Selenium and PhantomJS browser automation: https://blog.csdn.net/mingzme/article/details/107303299
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')                     # headless mode
options.add_argument('--proxy-server=http://ip:port')  # set a proxy
chrome = webdriver.Chrome(options=options)
chrome.get("https://cn.bing.com/")
chrome.find_element_by_id('sb_form_q').send_keys('python')  # type the query into the search box
chrome.find_element_by_id('sb_form_go').click()             # click the search button
chrome.save_screenshot('baidu.png')                         # take a screenshot
js = 'document.documentElement.scrollTop=1000000'
chrome.execute_script(js)                                   # scroll to the bottom of the page
html = chrome.page_source                                   # page source after rendering
chrome.quit()                                               # close the browser
4. Scrapy Framework Introduction and Configuration
- Create a project
scrapy startproject myfrist
- Create a spider
scrapy genspider <spider name> <domain to crawl>
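As a concrete illustration (the names here are only an assumption mirroring the Douban example below), the spider there could have been generated with:

scrapy genspider movie douban.com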
- Run a spider
scrapy crawl <spider name>
scrapy crawl <spider name> -o douban.json -t json

# Method 2: run from a Python script
from scrapy.cmdline import execute
execute('scrapy crawl movie'.split())
- Example
# movie.py
def parse(self, response):
    names = response.xpath('//div[@class="hd"]/a/span[1]/text()').extract()
    stars = response.xpath('//span[@class="rating_num"]/text()').extract()
    item = DoubanItem()
    for name, star in zip(names, stars):
        item['name'] = name
        item['star'] = star
        yield item

# items.py
class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    star = scrapy.Field()

# pipelines.py
from json import dumps

class DoubanPipeline:
    def open_spider(self, spider):  # runs when the spider starts
        self.filename = open('movies.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.filename.write(dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):  # runs when the spider finishes
        self.filename.close()
- settings.py options: https://blog.csdn.net/mingzme/article/details/107322777
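The linked post walks through the full file; below is a minimal sketch of the options the examples in this section lean on (the project path 'douban.pipelines.DoubanPipeline' and the values are illustrative, adjust them to your own project):

# settings.py (excerpt)
ROBOTSTXT_OBEY = False  # do not let robots.txt block the practice spiders
DOWNLOAD_DELAY = 1      # wait a second between requests to stay polite
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
}
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,  # assumed project name; enables the pipeline from the example above
}
LOG_LEVEL = 'WARNING'   # keep the console output readable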
- Example
# qu.py
import scrapy

class QuSpider(scrapy.Spider):
    name = 'qu'
    allowed_domains = ['qu.la']
    start_urls = ['https://www.qu.la/book/4703/2014176.html']

    def parse(self, response):
        title = response.xpath('//h1/text()').extract_first()
        content = response.xpath('string(//div[@class="content"])').extract_first().strip().replace(' ', '\n')
        next_url = response.xpath('//div[@class="section-opt"]/a[3]/@href').extract_first()
        yield {
            'title': title,
            'content': content
        }
        # keep crawling the next chapter
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

# pipelines.py
class FictionPipeline:
    def open_spider(self, spider):
        self.filename = open('fiction.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        info = item['title'] + '\n' + item['content'] + "\n"
        self.filename.write(info + '\n\n\n')
        self.filename.flush()
        return item

    def close_spider(self, spider):
        self.filename.close()
5. Advanced Scrapy
- Using CrawlSpider
- Create the spider
scrapy genspider -t crawl qu3 qu.la
- Example
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Qu3Spider(CrawlSpider):
    name = 'qu3'
    allowed_domains = ['qu.la']
    start_urls = ['https://www.qu.la/book/4703/']

    rules = (
        # extract the first chapter
        Rule(LinkExtractor(restrict_xpaths=r'//*[@id="list"]/dl/dd[13]/a'), callback='parse_item', follow=True),
        # then keep following the "next chapter" link
        Rule(LinkExtractor(restrict_xpaths=r'//div[@class="section-opt"]/a[3]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        title = response.xpath('//h1/text()').extract_first()
        content = response.xpath('string(//div[@class="content"])').extract_first().strip().replace(' ', '\n')
        yield {
            'title': title,
            'content': content
        }
- ImagesPipeline: downloading images
# zol.py
def parse(self, response):
    image_url = response.xpath('//img[@id="bigImg"]/@src').extract_first()
    image_name = response.xpath('string(//h3)').extract_first()
    yield {
        'image_urls': [image_url],  # the field name is fixed unless the pipeline methods are overridden
        'image_name': image_name
    }
    next_url = response.xpath('//a[@id="pageNext"]/@href').extract_first()
    yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

# pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request

class ImagePipeline(ImagesPipeline):  # subclass ImagesPipeline
    def get_media_requests(self, item, info):
        # image_urls is a list, so issue one request per URL in it
        return [Request(url, meta={'name': item['image_name']}) for url in item['image_urls']]

    def file_path(self, request, response=None, info=None):  # customise the saved file name
        name = request.meta['name'].strip().replace('\r\n\t\t', '')
        name = name.replace('/', '-')
        return name + '.jpg'

# settings.py
ITEM_PIPELINES = {
    # enable the subclass (instead of the stock ImagesPipeline) once its methods are overridden
    'image.pipelines.ImagePipeline': 300,
}
IMAGES_STORE = 'C:/Users/Mingz/Desktop/PythonLab/imagee'
- Middleware: dynamic User-Agent
# middlewares.py
from fake_useragent import UserAgent
# from image.settings import USER_AGENTS
# from random import choice

class UserAgentMiddleware:
    def process_request(self, request, spider):
        request.headers.setdefault(b'User-Agent', UserAgent().random)

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # priority below 400 so it runs before Scrapy's built-in UserAgentMiddleware
    'image.middlewares.UserAgentMiddleware': 343,
}
USER_AGENTS = [
    'a', 'b', 'c'
]
- Middleware: dynamic proxy
# middlewares.py
class ProxyMiddleware:
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://uname:password@ip:port'

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'image.middlewares.ProxyMiddleware': 344
}
- Login form
class FilterSpider(scrapy.Spider):
    name = 'filter'
    allowed_domains = ['baidu.com']

    def start_requests(self):
        url = 'https://www.baidu.com'
        form_data = {
            'user': 'user',
            'password': 'pwd'
        }
        for num in range(3):
            # dont_filter=True disables deduplication; formdata carries the form fields;
            # cookies must be passed as a dict (placeholder values below)
            yield scrapy.FormRequest(url, callback=self.parse, formdata=form_data,
                                     dont_filter=True, cookies={'key': 'value'})
6. Storing Crawled Data
- Using MongoDB
show dbs
db.createCollection('student')
db.dropDatabase()
show tables
show collections  # same as show tables
db.student.drop()
- CRUD operations
db.student.save([{name:"刘备"},{name:"董卓"}])                 # overwrites when the _id already exists
db.student.insert({name:"刘备"})                               # errors when the _id already exists
db.student.update({name:"刘备"},{age:33,name:"刘备"})          # replaces the whole document
db.student.update({name:"刘备"},{$set:{age:18}},{multi:true})  # update several documents
db.student.remove({name:"刘备"},{justOne:true})
db.student.remove({})                                          # delete all documents
db.student.find().limit(3).skip(6).sort({age:1})               # 1 ascending, -1 descending
db.student.find({country:"魏国"}).count()
db.student.find({$or:[{age:{$lt:25}},{country:'魏国'}]})       # age below 25, or ...
db.student.find({age:{$in:[25,28]}})
db.student.find({name:/^曹/})                                  # fuzzy match: names starting with 曹
db.student.find({name:{$regex:"^曹"}})
db.student.find({$where:function(){return this.age>=23}})      # custom query
db.student.distinct('country')                                 # distinct values
db.student.find({'age':{$exists:true}})
- MongoDB from Python
from pymongo import MongoClient

client = MongoClient()
school = client.school    # database handle
student = school.student  # collection handle
stus = student.find()
print(stus.next())
stu = student.find_one({"country": "蜀国"})
stus = student.find().skip(6).limit(6)
# stus = student.find().sort("age", pymongo.DESCENDING)
stu = {"name": "诸葛亮", "country": "蜀国"}
student.insert_one(stu)
student.update_one({"name": "诸葛亮"}, {"$set": {"age": 30}})
student.delete_many({"name": "诸葛亮"})
- Saving scraped data to a database
# MongoDB pipeline
from pymongo import MongoClient

class MongoDemoPipeline:
    def open_spider(self, spider):
        self.client = MongoClient()
        self.db = self.client.movie
        self.collection = self.db.collection

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

# MySQL pipeline
from pymysql import connect  # assuming pymysql as the MySQL driver

class MysqlPipeline:
    def open_spider(self, spider):
        self.client = connect(host='localhost', port=3306, user='root',
                              password='root', db='test01')
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        sql = 'insert into t_maoyan values(0, %s, %s)'
        self.cursor.execute(sql, [item['name'], item['star']])
        self.client.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.client.close()

# settings.py
ITEM_PIPELINES = {
    'mongo_demo.pipelines.MongoDemoPipeline': 300,
    'mongo_demo.pipelines.MysqlPipeline': 301,
}
7. Scraping Dynamic Data
- Splash with Python: https://blog.csdn.net/mingzme/article/details/107339895
# render.html endpoint
import requests
from fake_useragent import UserAgent

url = 'https://www.guazi.com/hengshui/buy/'
base_url = 'http://192.168.99.100:8050/render.html?url={}&wait=2'.format(url)
resp = requests.get(base_url, headers={'User-Agent': UserAgent().chrome})

# execute endpoint with a Lua script
import requests
from urllib.parse import quote
from fake_useragent import UserAgent

url = 'https://www.guazi.com/hengshui/buy/'
lua_script = '''
function main(splash, args)
    assert(splash:go('{}'))
    assert(splash:wait(0.5))
    return splash:html()
end
'''.format(url)
base_url = 'http://192.168.99.100:8050/execute?lua_source=' + quote(lua_script)
resp = requests.get(base_url, headers={'User-Agent': UserAgent().chrome})
- Splash with Scrapy
# settings.py
SPLASH_URL = 'http://192.168.99.100:8050/'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# guazi1.py
from scrapy_splash import SplashRequest

class Guazi1Spider(scrapy.Spider):
    name = 'guazi1'
    allowed_domains = ['guazi.com']

    def start_requests(self):
        url = 'https://www.guazi.com/bj/buy'
        yield SplashRequest(url, callback=self.parse, args={'wait': 2})

    def parse(self, response):
        print(response.text)

# guazi2.py
def start_requests(self):
    url = 'https://www.guazi.com/hengshui/buy/'
    lua_script = '''
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(0.5))
        return splash:html()
    end
    '''
    yield SplashRequest(url, callback=self.parse, endpoint='execute',
                        args={'lua_source': lua_script})
- Combining Selenium with Scrapy
# baidu.py
import scrapy
from selenium import webdriver
from scrapy import signals

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://www.baidu.com/']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BaiduSpider, cls).from_crawler(crawler, *args, **kwargs)  # build the spider instance
        spider.driver = webdriver.Chrome()
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)  # hook the spider_closed signal
        return spider

    def spider_closed(self, spider):
        spider.driver.quit()

    def parse(self, response):
        print(response.text)

# middlewares.py
from scrapy.http import HtmlResponse

class SeleniumMiddleware:
    def process_request(self, request, spider):
        spider.driver.get(request.url)
        html = spider.driver.page_source
        # returning a response here skips the downloader and any later download middlewares
        return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8')

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'selenium_demo.middlewares.SeleniumMiddleware': 543,
}