1. Crawler Introduction and Common Tools
- First crawler
from urllib.request import urlopen

url = 'http://www.baidu.com'
response = urlopen(url)
print(response.read().decode())
print(response.getcode())  # HTTP status code
print(response.geturl())   # the URL actually visited
print(response.info())     # HTTP response headers
- GET request
from urllib.request import urlopen, Request
from urllib.parse import quote, urlencode

args = {
    'wd': "尚学堂",
    'ie': 'utf-8'
}
print(urlencode(args))  # wd=%E5%B0%9A%E5%AD%A6%E5%A0%82&ie=utf-8
# url = 'https://www.baidu.com/s?ie=UTF-8&wd={}'.format(quote("尚学堂"))
url = 'https://www.baidu.com/s?{}'.format(urlencode(args))  # args already contains wd and ie
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
}
req = Request(url, headers=headers)
print(urlopen(req).read().decode())
- POST request
from urllib.request import urlopen, Request
from urllib.parse import urlencode

url = 'https://www.baidu.com/'
args = {
    'user': '111111',
    'password': '123456'
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
}
# passing data makes this a POST; the form data must be encoded to bytes
req = Request(url, headers=headers, data=urlencode(args).encode())
print(urlopen(req).read().decode())
- HTTPS requests (certificate problems)
import ssl
from urllib.request import urlopen

context = ssl._create_unverified_context()  # skip SSL certificate verification
# url: any https address, e.g. the one built in the examples above
print(urlopen(url, context=context).read().decode())
2. Common Crawler Development Modules
- Dynamic User-Agent
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.ie)
print(ua.chrome)
print(ua.random)
- Using an opener
from urllib.request import Request, build_opener, HTTPHandler
from fake_useragent import UserAgent

url = "http://httpbin.org/get"
headers = {
    "User-Agent": UserAgent().random
}
req = Request(url, headers=headers)
handler = HTTPHandler(debuglevel=1)  # print debug information for every request
opener = build_opener(handler)
resp = opener.open(req)
# print(resp.read().decode())
- Using a proxy
from urllib.request import build_opener, ProxyHandler

# handler = ProxyHandler({"http": "name:password@ip:port"})  # proxy with authentication
handler = ProxyHandler({"http": "211.137.52.158:8080"})
opener = build_opener(handler)
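The opener built above still needs a request sent through it; a minimal usage sketch, assuming the httpbin echo service and the same placeholder proxy address (swap in a working proxy):

from urllib.request import Request, build_opener, ProxyHandler
from fake_useragent import UserAgent

handler = ProxyHandler({"http": "211.137.52.158:8080"})  # placeholder proxy
opener = build_opener(handler)
req = Request("http://httpbin.org/get", headers={"User-Agent": UserAgent().random})
resp = opener.open(req)       # the request is routed through the proxy
print(resp.read().decode())   # httpbin echoes the origin IP, which should be the proxy's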
- Using cookies
from urllib.request import build_opener, HTTPCookieProcessor

handler = HTTPCookieProcessor()  # keeps cookies between requests made with this opener
opener = build_opener(handler)
- Saving and reusing cookies
# Saving cookies
from http.cookiejar import MozillaCookieJar
from urllib.request import build_opener, HTTPCookieProcessor

cookie_jar = MozillaCookieJar()
handler = HTTPCookieProcessor(cookie_jar)
opener = build_opener(handler)
resp = opener.open(req)  # req: a Request built as in the opener example above
cookie_jar.save('cookie.txt', ignore_discard=True, ignore_expires=True)

# Reusing cookies
from http.cookiejar import MozillaCookieJar

cookie_jar = MozillaCookieJar()
cookie_jar.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = HTTPCookieProcessor(cookie_jar)
opener = build_opener(handler)
resp = opener.open(req)
- Catching URLError
from urllib.error import URLError
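A minimal sketch of how the exception is typically handled, wrapping a urllib request like the ones above (the URL and timeout are only illustrative):

from urllib.request import urlopen, Request
from urllib.error import URLError
from fake_useragent import UserAgent

try:
    req = Request("http://httpbin.org/get", headers={"User-Agent": UserAgent().random})
    resp = urlopen(req, timeout=5)
    print(resp.getcode())
except URLError as e:
    # covers DNS failures, refused connections, timeouts wrapped by urllib, etc.
    print("request failed:", e.reason)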
- Using requests
- GET request
import requests
from fake_useragent import UserAgent

url = "http://httpbin.org/get"
proxy = {
    "http": "http://211.137.52.158:8080"  # route the request through a proxy
}
headers = {"User-Agent": UserAgent().random}
resp = requests.get(url, headers=headers, proxies=proxy)
print(resp.url)
resp.encoding = 'utf-8'
print(resp.text)
- Session keeps cookies automatically
import requests

s = requests.Session()
# a GET sent through the Session object; the response sets a cookie that the session stores
s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')
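To confirm the session really re-sends the stored cookie, a quick check against httpbin's /cookies endpoint (a sketch building on the request above):

import requests

s = requests.Session()
s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')
# the next request through the same session sends the stored cookie automatically
resp = s.get('http://httpbin.org/cookies')
print(resp.text)                       # {"cookies": {"sessioncookie": "123456789"}}
print(s.cookies.get('sessioncookie'))  # 123456789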
- SSL verification
import requests

# suppress the InsecureRequestWarning raised when certificate verification is disabled
requests.packages.urllib3.disable_warnings()
resp = requests.get(url, verify=False, headers=headers)
3. Data Extraction and Captcha Recognition
- Regular expressions: https://blog.csdn.net/mingzme/article/details/107250157
import re

f1 = re.match(r"\w", str)                   # match at the start of the string
s1 = re.sub(r"every_day", "EveryDay", str)  # substitution / replace
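Beyond match and sub, a short sketch of the other functions that come up constantly when extracting data (the sample text is made up):

import re

text = 'price: 42, discount price: 36'
m = re.search(r'\d+', text)              # first match anywhere in the string
print(m.group())                         # '42'
print(re.findall(r'\d+', text))          # every match as a list: ['42', '36']
pattern = re.compile(r'price:\s*(\d+)')  # precompile when a pattern is reused
print(pattern.findall(text))             # ['42', '36']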
- Data extraction with Beautiful Soup: https://blog.csdn.net/mingzme/article/details/107250908
from bs4 import BeautifulSoup

soup = BeautifulSoup(str, 'lxml')   # str: an HTML string
print(soup.title)
a = soup.select('CSS selector')[0]  # select() takes a CSS selector
print(a.text)                       # text content of the element
print(a.get('href'))                # an attribute of the element
- Data extraction with XPath: https://blog.csdn.net/mingzme/article/details/107252400
import requests
from lxml import etree
from fake_useragent import UserAgent

url = 'https://www.qidian.com/rank/fengyun?style=1&year=2018&month=08'
headers = {"User-Agent": UserAgent().chrome}
resp = requests.get(url, headers=headers)
e = etree.HTML(resp.text)
names = e.xpath('//div[@class="book-mid-info"]/h4/a/text()')
authors = e.xpath('//p[@class="author"]/a[1]/text()')
for name, author in zip(names, authors):
    print(name + ":" + author)
- Data extraction with PyQuery: https://blog.csdn.net/mingzme/article/details/107255479
from pyquery import PyQuery

doc = PyQuery(resp.text)
names = [a.text for a in doc('h4 a')]  # doc() takes a CSS selector
print(names)
- Data extraction with jsonpath: https://blog.csdn.net/mingzme/article/details/107299928
- json
import json

str = '{"name":"盗梦空间"}'
obj = json.loads(str)                          # string -> dict
obj_str = json.dumps(obj, ensure_ascii=False)  # dict -> string
# save an object to a file
json.dump(obj, open('movie.txt', 'w', encoding='utf-8'), ensure_ascii=False)
obj2 = json.load(open('movie.txt', encoding='utf-8'))  # file -> object
- Using jsonpath
import json
from jsonpath import jsonpath

names = jsonpath(json.loads(resp.text), '$..name')
ids = jsonpath(resp.json(), '$..id')
- Tesseract text recognition (Tesseract must be installed)
import pytesseract
from PIL import Image

img = Image.open('yzm1.jpg')
str = pytesseract.image_to_string(img)
print(str)
- Selenium and PhantomJS browser automation: https://blog.csdn.net/mingzme/article/details/107303299
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')                     # headless mode
options.add_argument('--proxy-server=http://ip:port')  # set a proxy
chrome = webdriver.Chrome(options=options)
chrome.get("https://cn.bing.com/")
chrome.find_element_by_id('sb_form_q').send_keys('python')  # type the query into the search box
chrome.find_element_by_id('sb_form_go').click()             # click the search button
chrome.save_screenshot('baidu.png')                         # take a screenshot
js = 'document.documentElement.scrollTop=1000000'
chrome.execute_script(js)                                   # scroll to the bottom of the page
html = chrome.page_source                                   # page source after rendering
chrome.quit()                                               # close the browser
4. Scrapy Framework Introduction and Configuration
- Create a project
scrapy startproject myfrist
- Create a spider
scrapy genspider <spider name> <domain to crawl>
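As a concrete illustration (the names here are only an assumption mirroring the Douban example below), the spider there could have been generated with:

scrapy genspider movie douban.com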
- Run a spider
scrapy crawl <spider name>
scrapy crawl <spider name> -o douban.json -t json

# Method 2: run from a Python script
from scrapy.cmdline import execute
execute('scrapy crawl movie'.split())
- Example
# movie.py
def parse(self, response):
    names = response.xpath('//div[@class="hd"]/a/span[1]/text()').extract()
    stars = response.xpath('//span[@class="rating_num"]/text()').extract()
    item = DoubanItem()
    for name, star in zip(names, stars):
        item['name'] = name
        item['star'] = star
        yield item

# items.py
class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    star = scrapy.Field()

# pipelines.py
from json import dumps

class DoubanPipeline:
    def open_spider(self, spider):  # runs when the spider starts
        self.filename = open('movies.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.filename.write(dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):  # runs when the spider finishes
        self.filename.close()
- settings.py options: https://blog.csdn.net/mingzme/article/details/107322777
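The linked post walks through the full file; below is a minimal sketch of the options the examples in this section lean on (the project path 'douban.pipelines.DoubanPipeline' and the values are illustrative, adjust them to your own project):

# settings.py (excerpt)
ROBOTSTXT_OBEY = False  # do not let robots.txt block the practice spiders
DOWNLOAD_DELAY = 1      # wait a second between requests to stay polite
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
}
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,  # assumed project name; enables the pipeline from the example above
}
LOG_LEVEL = 'WARNING'   # keep the console output readable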
- Example
# qu.py
import scrapy

class QuSpider(scrapy.Spider):
    name = 'qu'
    allowed_domains = ['qu.la']
    start_urls = ['https://www.qu.la/book/4703/2014176.html']

    def parse(self, response):
        title = response.xpath('//h1/text()').extract_first()
        content = response.xpath('string(//div[@class="content"])').extract_first().strip().replace(' ', '\n')
        next_url = response.xpath('//div[@class="section-opt"]/a[3]/@href').extract_first()
        yield {
            'title': title,
            'content': content
        }
        # keep crawling the next chapter
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

# pipelines.py
class FictionPipeline:
    def open_spider(self, spider):
        self.filename = open('fiction.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        info = item['title'] + '\n' + item['content'] + "\n"
        self.filename.write(info + '\n\n\n')
        self.filename.flush()
        return item

    def close_spider(self, spider):
        self.filename.close()
5. Advanced Scrapy
- Using CrawlSpider
- Create the spider
scrapy genspider -t crawl qu3 qu.la
- Example
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Qu3Spider(CrawlSpider):
    name = 'qu3'
    allowed_domains = ['qu.la']
    start_urls = ['https://www.qu.la/book/4703/']

    rules = (
        # extract the first chapter
        Rule(LinkExtractor(restrict_xpaths=r'//*[@id="list"]/dl/dd[13]/a'), callback='parse_item', follow=True),
        # then keep following the "next chapter" link
        Rule(LinkExtractor(restrict_xpaths=r'//div[@class="section-opt"]/a[3]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        title = response.xpath('//h1/text()').extract_first()
        content = response.xpath('string(//div[@class="content"])').extract_first().strip().replace(' ', '\n')
        yield {
            'title': title,
            'content': content
        }
- ImagesPipeline: downloading images
# zol.py
def parse(self, response):
    image_url = response.xpath('//img[@id="bigImg"]/@src').extract_first()
    image_name = response.xpath('string(//h3)').extract_first()
    yield {
        'image_urls': [image_url],  # the field name is fixed unless the pipeline methods are overridden
        'image_name': image_name
    }
    next_url = response.xpath('//a[@id="pageNext"]/@href').extract_first()
    yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

# pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request

class ImagePipeline(ImagesPipeline):  # subclass ImagesPipeline
    def get_media_requests(self, item, info):
        # image_urls is a list, so issue one request per URL in it
        return [Request(url, meta={'name': item['image_name']}) for url in item['image_urls']]

    def file_path(self, request, response=None, info=None):  # customise the saved file name
        name = request.meta['name'].strip().replace('\r\n\t\t', '')
        name = name.replace('/', '-')
        return name + '.jpg'

# settings.py
ITEM_PIPELINES = {
    # enable the subclass (instead of the stock ImagesPipeline) once its methods are overridden
    'image.pipelines.ImagePipeline': 300,
}
IMAGES_STORE = 'C:/Users/Mingz/Desktop/PythonLab/imagee'
- Middleware: dynamic User-Agent
# middlewares.py
from fake_useragent import UserAgent
# from image.settings import USER_AGENTS
# from random import choice

class UserAgentMiddleware:
    def process_request(self, request, spider):
        request.headers.setdefault(b'User-Agent', UserAgent().random)

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # priority below 400 so it runs before Scrapy's built-in UserAgentMiddleware
    'image.middlewares.UserAgentMiddleware': 343,
}
USER_AGENTS = [
    'a', 'b', 'c'
]
- Middleware: dynamic proxy
# middlewares.py
class ProxyMiddleware:
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://uname:password@ip:port'

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'image.middlewares.ProxyMiddleware': 344
}
- Login form
class FilterSpider(scrapy.Spider):
    name = 'filter'
    allowed_domains = ['baidu.com']

    def start_requests(self):
        url = 'https://www.baidu.com'
        form_data = {
            'user': 'user',
            'password': 'pwd'
        }
        for num in range(3):
            # dont_filter=True disables deduplication; formdata carries the form fields;
            # cookies must be passed as a dict (placeholder values below)
            yield scrapy.FormRequest(url, callback=self.parse, formdata=form_data,
                                     dont_filter=True, cookies={'key': 'value'})
6. Storing Crawled Data
- Using MongoDB
show dbs
db.createCollection('student')
db.dropDatabase()
show tables
show collections  # same as show tables
db.student.drop()
- CRUD operations
db.student.save([{name:"刘备"},{name:"董卓"}])                 # overwrites when the _id already exists
db.student.insert({name:"刘备"})                               # errors when the _id already exists
db.student.update({name:"刘备"},{age:33,name:"刘备"})          # replaces the whole document
db.student.update({name:"刘备"},{$set:{age:18}},{multi:true})  # update several documents
db.student.remove({name:"刘备"},{justOne:true})
db.student.remove({})                                          # delete all documents
db.student.find().limit(3).skip(6).sort({age:1})               # 1 ascending, -1 descending
db.student.find({country:"魏国"}).count()
db.student.find({$or:[{age:{$lt:25}},{country:'魏国'}]})       # age below 25, or ...
db.student.find({age:{$in:[25,28]}})
db.student.find({name:/^曹/})                                  # fuzzy match: names starting with 曹
db.student.find({name:{$regex:"^曹"}})
db.student.find({$where:function(){return this.age>=23}})      # custom query
db.student.distinct('country')                                 # distinct values
db.student.find({'age':{$exists:true}})
- MongoDB from Python
from pymongo import MongoClient

client = MongoClient()
school = client.school    # database handle
student = school.student  # collection handle
stus = student.find()
print(stus.next())
stu = student.find_one({"country": "蜀国"})
stus = student.find().skip(6).limit(6)
# stus = student.find().sort("age", pymongo.DESCENDING)
stu = {"name": "诸葛亮", "country": "蜀国"}
student.insert_one(stu)
student.update_one({"name": "诸葛亮"}, {"$set": {"age": 30}})
student.delete_many({"name": "诸葛亮"})
- Saving scraped data to a database
# MongoDB pipeline
from pymongo import MongoClient

class MongoDemoPipeline:
    def open_spider(self, spider):
        self.client = MongoClient()
        self.db = self.client.movie
        self.collection = self.db.collection

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

# MySQL pipeline
from pymysql import connect  # assuming pymysql as the MySQL driver

class MysqlPipeline:
    def open_spider(self, spider):
        self.client = connect(host='localhost', port=3306, user='root',
                              password='root', db='test01')
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        sql = 'insert into t_maoyan values(0, %s, %s)'
        self.cursor.execute(sql, [item['name'], item['star']])
        self.client.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.client.close()

# settings.py
ITEM_PIPELINES = {
    'mongo_demo.pipelines.MongoDemoPipeline': 300,
    'mongo_demo.pipelines.MysqlPipeline': 301,
}
7. Scraping Dynamic Data
- Splash with Python: https://blog.csdn.net/mingzme/article/details/107339895
# render.html endpoint
import requests
from fake_useragent import UserAgent

url = 'https://www.guazi.com/hengshui/buy/'
base_url = 'http://192.168.99.100:8050/render.html?url={}&wait=2'.format(url)
resp = requests.get(base_url, headers={'User-Agent': UserAgent().chrome})

# execute endpoint with a Lua script
import requests
from urllib.parse import quote
from fake_useragent import UserAgent

url = 'https://www.guazi.com/hengshui/buy/'
lua_script = '''
function main(splash, args)
    assert(splash:go('{}'))
    assert(splash:wait(0.5))
    return splash:html()
end
'''.format(url)
base_url = 'http://192.168.99.100:8050/execute?lua_source=' + quote(lua_script)
resp = requests.get(base_url, headers={'User-Agent': UserAgent().chrome})
- Splash with Scrapy
# settings.py
SPLASH_URL = 'http://192.168.99.100:8050/'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# guazi1.py
from scrapy_splash import SplashRequest

class Guazi1Spider(scrapy.Spider):
    name = 'guazi1'
    allowed_domains = ['guazi.com']

    def start_requests(self):
        url = 'https://www.guazi.com/bj/buy'
        yield SplashRequest(url, callback=self.parse, args={'wait': 2})

    def parse(self, response):
        print(response.text)

# guazi2.py
def start_requests(self):
    url = 'https://www.guazi.com/hengshui/buy/'
    lua_script = '''
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(0.5))
        return splash:html()
    end
    '''
    yield SplashRequest(url, callback=self.parse, endpoint='execute',
                        args={'lua_source': lua_script})
- Combining Selenium with Scrapy
# baidu.py
import scrapy
from selenium import webdriver
from scrapy import signals

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://www.baidu.com/']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BaiduSpider, cls).from_crawler(crawler, *args, **kwargs)  # build the spider instance
        spider.driver = webdriver.Chrome()
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)  # hook the spider_closed signal
        return spider

    def spider_closed(self, spider):
        spider.driver.quit()

    def parse(self, response):
        print(response.text)

# middlewares.py
from scrapy.http import HtmlResponse

class SeleniumMiddleware:
    def process_request(self, request, spider):
        spider.driver.get(request.url)
        html = spider.driver.page_source
        # returning a response here skips the downloader and any later download middlewares
        return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8')

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'selenium_demo.middlewares.SeleniumMiddleware': 543,
}