python & 爬虫

最新推荐文章于 2024-08-06 11:55:39 发布

-无妄-

最新推荐文章于 2024-08-06 11:55:39 发布

阅读量333

点赞数

分类专栏：大数据基础

本文链接：https://blog.csdn.net/bingdianone/article/details/87856738

版权

大数据基础专栏收录该内容

1 篇文章 0 订阅

订阅专栏

文章目录

爬虫简介
- 定义
- 爬虫能做什么
python环境配置与初探
- 安装python2.7
爬虫原理与架构
爬虫实例演示
- “新浪科技”爬虫
爬虫常见问题

爬虫简介

定义

在这里插入图片描述

爬虫能做什么

爬虫爬什么
- 最近热门新闻？
- 公司产品数据
- 个人感兴趣的信息
- …
爬虫的知识储备
- Python / Java / Shell / R …
- HTML
- 高阶知识：深度优先、广度优先、OCR、分布式、Hash、Oracle、Redis…

python环境配置与初探

安装python2.7

可以参考如下链接：
http://www.runoob.com/python/python-install.html

什么是Python：解释型语言、无需手动编译、交互式语言、面向对象语言
Python环境配置
https://anaconda.org/
pandas、numpy、beautifulsoup4
Scrapy框架
Python初体验

爬虫原理与架构

在这里插入图片描述
网页之间相互链接的关系图

爬虫实例演示

“新浪科技”爬虫

首先建立数据库

-- Recreate the crawler database from scratch. IF EXISTS keeps the very first
-- run (when there is nothing to drop yet) from aborting with an error.
DROP DATABASE IF EXISTS sina_spider;

-- utf8mb4 matches the charset requested by the crawler's connection URL;
-- MySQL's legacy "utf8" is limited to 3-byte characters and rejects the rest.
CREATE DATABASE sina_spider CHARACTER SET utf8mb4;

USE sina_spider;

-- One row per crawled page: its title, its URL and the raw HTML.
CREATE TABLE IF NOT EXISTS sina_spider(
id integer primary key AUTO_INCREMENT,
title text,
url text,
content LONGTEXT
)CHARACTER set utf8mb4 COLLATE utf8mb4_unicode_ci;

在这里插入图片描述
导入常用包

拿到新浪科技网页内容

对网页内容进行解码，避免出现乱码
此时会用到 Beautiful Soup（中文文档链接：https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/ ）

可以看到拿到了title

拿到了929个链接

保存到数据库

队列大小不断变大
在这里插入图片描述

查看数据库

完整代码

# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup as bs  
import urllib2 # 爬虫常用包
import MySQLdb # 数据库包
from sqlalchemy import create_engine
import Queue
import pandas as pd  # 科学计算常用包


def get_title_links(html):
	"""Extract the page title and the absolute http(s) links from a page.

	Args:
		html: The page markup, either already-decoded text or raw bytes
			(raw bytes are parsed as UTF-8).

	Returns:
		A ``(title, links)`` tuple. ``title`` is the UTF-8 encoded text of the
		``<title>`` element, or ``None`` when the page has no usable title.
		``links`` is the de-duplicated set of ``href`` values that start with
		``http://`` or ``https://``. ``(None, None)`` is returned when the
		markup cannot be processed at all.
	"""
	try:  # some sites ship malformed HTML; never let one page stop the crawl
		if isinstance(html, bytes):
			soup = bs(html, from_encoding='utf-8')
		else:
			# An encoding hint is meaningless for text that is already decoded.
			soup = bs(html)

		# A missing or non-textual <title> must not cost us the page's links.
		title = None
		titleTag = soup.title
		if titleTag is not None and titleTag.string is not None:
			title = titleTag.string.encode('utf-8')

		links = set()  # a set, so repeated links are kept only once
		for anchor in soup.find_all('a', href=True):
			href = anchor['href'].strip()
			# Keep only URLs that can actually be fetched; a plain substring
			# test would also accept e.g. "/redirect?to=http://...".
			if href.lower().startswith(('http://', 'https://')):
				links.add(href)
	except Exception as e:
		print("get_title_links %s" % (e,))
		return None, None
	return title, links


def getLinks(targetURL):
	"""Download a page and extract its title and outgoing links.

	Args:
		targetURL: Absolute URL of the page to fetch.

	Returns:
		A ``(title, links, html)`` tuple, where ``html`` is the page body
		decoded as UTF-8 (undecodable bytes are replaced) and ``title`` /
		``links`` are whatever ``get_title_links`` returns for it.
		``(None, None, None)`` is returned when the download fails.
	"""
	try:
		req = urllib2.Request(targetURL)
		userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101 Firefox/58.0"
		# Present ourselves as a regular browser to the server.
		req.add_header('User-Agent', userAgent)
		resp = urllib2.urlopen(req, timeout=30)
		try:
			raw = resp.read()
		finally:
			# Release the connection even when reading the body fails.
			resp.close()
		html = raw.decode('utf-8', 'replace')
	except Exception as e:
		print("getLinks %s" % (e,))
		return None, None, None
	# get_title_links reports its own failures by returning (None, None).
	title, ilinks = get_title_links(html)
	return title, ilinks, html

_ENGINE = None  # SQLAlchemy engine shared by every saveDB call, created on first use


def saveDB(title, url, html):
	"""Append one crawled page to the ``sina_spider`` MySQL table.

	Failures are reported on stdout and otherwise ignored, so a database
	problem never interrupts the crawl.

	Args:
		title: Page title, stored in the ``title`` column.
		url: Page URL, stored in the ``url`` column.
		html: Page markup, stored in the ``content`` column.
	"""
	global _ENGINE
	try:
		if _ENGINE is None:
			# Build the engine (and its connection pool) once instead of
			# creating and abandoning a new one for every saved page.
			_ENGINE = create_engine("mysql://root:root@127.0.0.1/sina_spider?charset=utf8mb4")
		# A one-row DataFrame whose columns mirror the table's columns.
		data = pd.DataFrame([{'title': title, 'url': url, 'content': html}])
		# Append to the existing table; the DataFrame index is not stored.
		data.to_sql("sina_spider", _ENGINE, if_exists="append", index=False)
	except Exception as e:
		print("saveDB error %s" % (e,))

def addQueue(links, urlQueue, visitedURL, domain='sina.com.cn'):
	"""Queue every link that is new and belongs to the site being crawled.

	Args:
		links: Iterable of URL strings found on a page, or ``None`` when the
			page could not be processed.
		urlQueue: Queue of URLs still to be fetched; new links are ``put`` at
			its tail.
		visitedURL: Set of every URL queued so far; it is updated in place so
			that no URL is queued twice.
		domain: Text a URL must contain to be followed. Defaults to
			``'sina.com.cn'``.

	Returns:
		The same ``(urlQueue, visitedURL)`` objects that were passed in.
	"""
	if links is None:  # nothing was extracted from the page: leave both untouched
		return urlQueue, visitedURL
	for link in links:
		if link not in visitedURL and domain in link:
			urlQueue.put(link)
			# Mark it as seen right away, so a later page linking to the same
			# URL does not queue it again.
			visitedURL.add(link)
	return urlQueue, visitedURL

if __name__ == "__main__":
	# Breadth-first crawl of sina.com.cn pages, starting from the Sina Tech
	# front page. Every page that downloads successfully is stored in MySQL.
	startURL = "http://tech.sina.com.cn/"
	q = Queue.Queue()  # URLs still to fetch; FIFO order gives breadth-first traversal
	visitedURL = set([startURL])  # every URL queued so far, so none is queued twice
	q.put(startURL)
	while not q.empty():
		targetURL = q.get()
		print("queue size:  %d" % q.qsize())
		title, allLinks, html = getLinks(targetURL)
		if html is None:
			# The download failed (getLinks already printed why): there is
			# nothing to store and no links to follow.
			continue
		saveDB(title, targetURL, html)
		q, visitedURL = addQueue(allLinks, q, visitedURL)