一、urllib
# urllib ships with Python; its request submodule handles opening URLs.
from urllib import request

reply = request.urlopen("https://baidu.com")  # issue the GET request
body = reply.read()          # raw bytes of the response
print(body.decode("utf-8"))  # decode the bytes to text before printing
爬个赶集网招聘信息:
import urllib.request as ur

# Job-listings page for the Haidian district on ganji.com.
# FIX: the original URL literal ended with a trailing space
# ("…haidian/ "), which produces an invalid request path; removed.
url = "http://bj.ganji.com/zpbiaoqian/haidian/"
response = ur.urlopen(url)
# Read the raw response bytes and decode them as UTF-8 text.
print(response.read().decode("utf-8"))
循环分页爬:
解析爬取的信息的时候,分两种情况解析:一种是把解析出的名字放一个列表,把职位放一个列表,把薪资放一个列表;另一种是把名字和对应的职位以及对应的薪资,合为一个列表。显然后者的解析方式更好,因为按前者的方式,若某个公司其中一项缺失,数据就会错位。所以解析的时候最好是一块一块解析
第一种方式,按条爬,爬的全是职位
import urllib.request as ur
from lxml import etree

# Crawl the paginated job listings; the page number is encoded in the
# URL path as "o<N>". (Loop-body indentation restored — the snippet as
# pasted had lost it and would not run.)
for i in range(1, 5):
    url = "http://bj.ganji.com/zpbiaoqian/haidian/o%d/" % i
    response = ur.urlopen(url)
    html_str = response.read().decode("utf-8")
    # Parse the HTML string into an element tree so we can run XPath.
    html = etree.HTML(html_str)
    # Every job-title link carries this exact class triple.
    names = html.xpath("//a[@class='list-ga gj_tongji js-float']/text() ")
    print(names)
    break  # demo: stop after the first page instead of crawling all four
import urllib.request as ur
from lxml import etree

# Crawl the first four listing pages ("o1" … "o4"). (Loop-body
# indentation restored — the snippet as pasted had lost it.)
for i in range(1, 5):
    url = "http://bj.ganji.com/zpbiaoqian/haidian/o%d/" % i
    # Progress line so we can see which page is being fetched.
    print("正在爬取第", i, "页", url)
    response = ur.urlopen(url)
    html_str = response.read().decode("utf-8")
    html = etree.HTML(html_str)  # parse the string into an element tree
    # Job titles only — the <a> tag holds just the position name.
    names = html.xpath("//a[@class='list-ga gj_tongji js-float']/text() ")
    print(names)
第二种方式,按块爬,爬的是一块一块,岗位-工资
所以就需要爬到 HTML里标签dl (是标签a的上一层,上面爬的只有岗位是因为用的a标签)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2019/10/13 0:47
# @Author : Fan
# @File : pa1.py
# @Software: PyCharm
import urllib.request as ur
from lxml import etree
import xlrd
from xlutils.copy import copy

# Crawl pages 1-4 of the Haidian job listings and append one row per
# posting (title, company, salary) to ganji.xlsx.
#
# FIX 1: the workbook is opened and copied ONCE before the crawl and
# saved once per page; the original re-opened/re-copied it on every page
# and saved after every single row.
# FIX 2: the bare "except: pass" is narrowed to IndexError — the only
# expected failure is a posting missing one of the three fields.
# NOTE(review): xlutils saves in the legacy .xls format even though the
# file is named .xlsx (kept from the original) — verify the file opens.
work_book = xlrd.open_workbook("ganji.xlsx")
work_book_copy = copy(work_book)
sheet_copy = work_book_copy.get_sheet(0)

index = 0  # next free row in the sheet; acts like a cursor
for i in range(1, 5):
    url = "http://bj.ganji.com/zpbiaoqian/haidian/o%d/" % i
    print("正在爬取第", i, "页", url)
    response = ur.urlopen(url)
    html_str = response.read().decode("utf-8")
    html = etree.HTML(html_str)  # parse the HTML string into an element tree
    # Each <dl class='con-list-zcon new-dl'> block is one job posting,
    # so title/company/salary stay aligned even when a field is missing.
    dls = html.xpath("//dl[@class='con-list-zcon new-dl']")
    for dl in dls:
        try:
            name = dl.xpath("./dt/a[@class='list-ga gj_tongji js-float']/text()")[0]
            company = dl.xpath("./dt/div/a/@title")[0]
            salary = dl.xpath("./dd/div[@class='new-dl-salary']/text()")[0]
        except IndexError:
            # This posting lacks one of the three fields — skip the whole
            # block so the columns never get misaligned.
            continue
        sheet_copy.write(index, 0, name)
        sheet_copy.write(index, 1, company)
        sheet_copy.write(index, 2, salary)
        index += 1  # advance to the next row
    work_book_copy.save("ganji.xlsx")  # persist once per page
二、BeautifulSoup
主要用来解析HTML的,有的人偏爱xpath
安装BeautifulSoup
pip install beautifulsoup4
BeautifulSoup库最常见的对象恰好是BeautifulSoup对象
import urllib.request as ur
from bs4 import BeautifulSoup

# Fetch the first listings page and hand the raw HTML to BeautifulSoup.
page = ur.urlopen("http://bj.ganji.com/zpbiaoqian/haidian/o1/")
# No explicit parser is given here, so bs4 picks one itself (and warns).
soup = BeautifulSoup(page.read())
print(soup.a)  # the first <a> tag found in the document
解析一个a标签:
将HTML中所有的a标签都解析出来:
import urllib.request as ur
from bs4 import BeautifulSoup

# This time pull EVERY <a> tag out of the page, not just the first one.
page = ur.urlopen("http://bj.ganji.com/zpbiaoqian/haidian/o1/")
soup = BeautifulSoup(page.read())  # parser still left for bs4 to guess
print(soup.findAll("a"))  # findAll returns a list of matching tags
虽然结果都解析出来了,但是上面飘红报错是什么原因呢~~
import urllib.request as ur
from bs4 import BeautifulSoup

html = ur.urlopen("http://bj.ganji.com/zpbiaoqian/haidian/o1/")
# Passing an explicit parser ("html.parser", from the standard library)
# silences the "no parser was explicitly specified" warning.
# FIX: the original used full-width curly quotes (“html.parser”), which
# is a SyntaxError in Python; replaced with ASCII quotes.
bsobj = BeautifulSoup(html.read(), "html.parser")
print(bsobj.findAll("a"))
上图所有的a标签都放在了一个列表里 [ ]
find()和findAll()
find() 和 findAll() 可能是你最常用的两个函数了,通过标签和不同的属性轻松过滤HTML页面,查找需要的标签组或者单个标签
*args 和 ** kwargs —> 一个星 和 两个星 有什么区别?
答:这俩都叫不定长参数;*args 会把多余的位置参数收集成一个 元组,**kwargs 会把多余的关键字参数收集成一个 字典
name是标签名;attrs后面跟的是属性;
import urllib.request as ur
from bs4 import BeautifulSoup

html = ur.urlopen("http://bj.ganji.com/zpbiaoqian/haidian/o1/")
# FIX: the original used full-width curly quotes (“html.parser”), which
# is a SyntaxError in Python; replaced with ASCII quotes.
bsobj = BeautifulSoup(html.read(), "html.parser")
# findAll(name, attrs): name is the tag name, attrs filters by attribute —
# here, only <a> tags whose class is "list_title gj_tongji".
print(bsobj.findAll("a", {"class": "list_title gj_tongji"}))
import urllib.request as ur
from bs4 import BeautifulSoup

html = ur.urlopen("http://bj.ganji.com/zpbiaoqian/haidian/o1/")
bsobj = BeautifulSoup(html.read(), "html.parser")
# Restrict the match to job-title links via the class attribute.
links = bsobj.findAll("a", {"class": "list_title gj_tongji"})
# (Loop-body indentation restored — it was lost in the pasted snippet.)
for link in links:
    print(link)  # one <a> element per job title
三、导航树
若想通过标签在文档中的位置来查找标签,就可以使用导航树
bsObj.tag.subTag.anotherSubTag
1.子标签和后代标签处理
子标签就是父标签的下一级,后代标签是父标签下面所有级别的标签
# NOTE(review): .children and .descendants are lazy iterators, so these
# prints show generator/iterator objects rather than the tags themselves —
# wrap them in list(...) or loop over them to see the actual contents.
print(bsobj.find("a").children) # direct children of the first <a> tag
print(bsobj.find("a").descendants) # all nested descendants, every level
四、正则表达式
import re  # regular expressions live in the standard-library re module

# Compile once so the pattern object can be reused; "^asd" must match
# at the very start of the string.
rec = re.compile("^asd")
result = rec.match("asdfff")  # a Match object, or None when no match
print(result)