最近在学爬虫,把学到的代码总结了下,建议自己复制粘贴运行一下
使用第三方库 requests
1.判断访问是否成功
import requests

# 1. Check whether a request succeeded.
r = requests.get("http://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=NASDAQ&render=download")
r.status_code  # HTTP status code; 200 means success
r.encoding = "utf-8"  # fix: was "utf - 8 " — an invalid codec name that makes r.text raise LookupError
r.text  # response body decoded to str using r.encoding
r.content  # raw response bytes (e.g. to restore a binary image)
2.爬取网页通用代码框架
import requests


def getHMLText(url):
    """Fetch *url* and return the decoded page text.

    Returns the string "产生异常" instead of raising when the request
    fails (connection error, timeout, or non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise HTTPError unless the status is 2xx
        r.encoding = r.apparent_encoding  # guess the real encoding from the body
        return r.text
    except requests.RequestException:  # fix: was a bare except, which also swallowed KeyboardInterrupt/SystemExit
        return "产生异常"


if __name__ == "__main__":
    url = "http://www.baidu.com"
    print(getHMLText(url))
#URL格式 http://host[:port][path] — host是合法的Internet主机域名或IP地址,port是端口号(缺省端口为80),path是请求资源的路径
#URL是通过HTTP协议存取资源的Internet路径,一个URL对应一个数据资源
3.爬取某商品代码
import requests

# 3. Scrape a product page and print the first 1000 characters.
url = "http://item.jd.com/2967929.html"
try:
    r = requests.get(url)
    r.raise_for_status()  # raise HTTPError unless the status is 2xx
    r.encoding = r.apparent_encoding  # guess the real encoding from the body
    print(r.text[:1000])
except requests.RequestException:  # fix: was a bare except
    print("爬取失败")
4.请求头修改实例
import requests

# 4. Override the request headers to mimic a browser.
kv = {'user-agent': 'Mozilla/5.0'}  # fix: was the typo 'Mzilla/5.0' — real browsers send 'Mozilla/5.0'
url = "https://www.amazon.cn/gp/product/B01MBL5Z3Y"
r = requests.get(url, headers=kv)  # custom headers are passed here
print(r.status_code)  # 503 here means the server rejected the request
print(r.encoding)
r.encoding = r.apparent_encoding
print(r.text[1000:2000])
# Without the override, requests sends its own UA, which many sites block:
# {'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
print(r.request.headers)
5.网络图片爬取实例
import requests

# 5. Download an image and save it to a local file.
path = "D://personal//图片//abc.jpg"  # change to your own path
url = "http://pic1.sc.chinaz.com/Files/pic/pic9/202004/hpic2333_s.jpg"
r = requests.get(url)
print(r.status_code)
with open(path, 'wb') as f:
    f.write(r.content)  # the with-statement closes the file; the explicit f.close() was redundant
通用框架
import requests
import os

# Generic download framework: save the file named by the URL's last path
# segment into the `root` directory, skipping files that already exist.
url = ""
root = "D://pics"
# fix: the original `root + url.split('/')[-1]` omitted the path separator,
# producing e.g. "D://picsabc.jpg" instead of "D://pics/abc.jpg".
path = os.path.join(root, url.split('/')[-1])
try:
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(path):
        r = requests.get(url)
        with open(path, 'wb') as f:  # the with-statement closes the file; explicit close() was redundant
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):  # fix: was a bare except; these cover network and filesystem failures
    print("爬取失败")
6.beautifulsoup使用实例
import requests
from bs4 import BeautifulSoup

# 6. BeautifulSoup basics: navigating the parse tree of a demo page.
r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")  # fix: the soup was built twice; once is enough
# print(soup.prettify())
print(soup.title)
tag = soup.a  # the first <a> (link) tag in the document
print(tag)
print(soup.a.name)  # tag name of <a>
print(soup.a.parent.name)  # name of its parent tag
print(soup.a.parent.parent.name)
print(tag.attrs)  # the tag's attributes as a dict
print(tag.attrs['class'])  # value for a given attribute key
print(tag.attrs['href'])  # the link target
print(type(tag.attrs))  # attrs is a plain dict
print(soup.a.string)  # the tag's text content
# .contents: list of child nodes; .children: iterator over child nodes;
# .descendants: iterator over all descendant nodes.
print(soup.head.contents)
print(soup.body.contents)
print(len(soup.body.contents))  # number of child nodes
print(soup.body.contents[1])  # child node at index 1
print(soup.title.parent)
for parent in soup.a.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
# Sibling traversal only works among nodes that share the same parent.
print(soup.a.next_sibling)  # next sibling — may be a text node, not a tag
print(soup.a.next_sibling.next_sibling)  # sibling after that
print(soup.a.previous_sibling)  # previous sibling
for sibling in soup.a.previous_siblings:
    print(sibling)