基础爬虫
简单爬取网页信息,未进行过滤
代码演示
"""Basic crawler: POST to Lagou's job-listing AJAX endpoint and save the raw JSON response."""
import urllib.request
import urllib.parse

from fake_useragent import UserAgent

UA = UserAgent()  # NOTE(review): unused in this script — the User-Agent below is hard-coded

base_url = "https://www.lagou.com/jobs/positionAjax.json?"
# Chinese query-string values must be URL-encoded
se_url = urllib.parse.urlencode({
    "city": "成都",
    "needAddtionalResult": 'false'
})
url = base_url + se_url
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # BUG FIX: removed "'Accept-Encoding': 'gzip, deflate, br'". urllib does NOT
    # transparently decompress responses, so advertising compression support makes
    # the server send a gzip/brotli body and response.read().decode('utf-8') below
    # fails with a UnicodeDecodeError.
    'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
    'Connection': 'keep-alive',
    'Cookie': 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545208729; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545208853; _ga=GA1.2.673678807.1545208729; user_trace_token=20181219163959-ab4bffbc-0369-11e9-a1e6-525400f775ce; LGSID=20181219163959-ab4c0263-0369-11e9-a1e6-525400f775ce; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fbaidu%3Fwd%3Dlagou%26tn%3Dmonline_4_dg%26ie%3Dutf-8; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; LGRID=20181219164202-f4fc640f-0369-11e9-9212-5254005c3644; LGUID=20181219163959-ab4c04da-0369-11e9-a1e6-525400f775ce; _gid=GA1.2.389895687.1545208737; index_location_city=%E5%85%A8%E5%9B%BD'
}
# POST form fields
data = {
    'first': 'true',
    'pn': '2',
    'kd': 'python'
}
data = bytes(urllib.parse.urlencode(data), encoding='utf-8')
# build the request with custom headers; supplying `data` turns it into a POST
request = urllib.request.Request(url, data=data, headers=headers)
response = urllib.request.urlopen(request)
# persist the raw response body
with open('lagou.txt', 'w', encoding='utf-8') as f:
    f.write(response.read().decode('utf-8'))
简单的反爬(使用随机 User-Agent)
"""Fetch a page while disguising the client with a randomized User-Agent header."""
# import the required modules
import urllib.request
from fake_useragent import UserAgent

agent_pool = UserAgent()
target = "https://www.xicidaili.com"
# urlopen issues a GET request by default
request_headers = {
    "User-Agent": agent_pool.random,
}
# show one randomly generated user-agent string
print(agent_pool.random)
# build the request with the customized headers
req = urllib.request.Request(target, headers=request_headers)
resp = urllib.request.urlopen(req)
# dump the page source
print(resp.read().decode('utf-8'))
使用 requests,设置 IP 代理
代码演示
'''
IP proxy demo: route a request through a proxy server with requests.
'''
import requests

# target URL (shows the visitor's apparent IP)
url = 'https://www.ipip.net/'
# route traffic through a proxy via the `proxies` parameter
proxy = {
    'http': 'http://123.161.21.51:9797',
    'https': 'http://123.161.21.51:9797'
}
response = requests.get(url, proxies=proxy)
response.encoding = 'utf-8'
# BUG FIX: open the file with an explicit UTF-8 encoding — the response text is
# decoded as UTF-8 above, and the platform-default encoding (e.g. GBK on a
# Chinese Windows locale) would raise UnicodeEncodeError on write.
with open('Code/ip.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)
print(response.text)
模拟登录 GitHub
代码演示
'''
Log in to GitHub by replaying its login form inside one requests session.
'''
import requests
import re

# the login page embeds the CSRF token needed by the form POST
url_token = 'https://github.com/login'
# a Session carries cookies across requests (required for the login to stick)
session = requests.Session()
# fetch the login page and pull out the authenticity token
response_token = session.get(url_token)
response_token.encoding = 'utf-8'
# findall returns a list of captured groups
token = re.findall(
    'name="authenticity_token" value="(.*?)" />', response_token.text)
if not token:
    # fail loudly instead of posting a broken form
    raise RuntimeError('authenticity_token not found on the login page')
# form-submission endpoint
url_login = 'https://github.com/session'
# form payload
data = {
    'commit': 'Sign+in',
    'utf8': '✓',
    # BUG FIX: was `token[0:]`, which is the whole *list* — the server expects
    # the token string itself, i.e. the first captured match.
    'authenticity_token': token[0],
    'login': '',  # account name
    'password': '',  # password
}
# post through the SAME session so the cookies set above are reused;
# a bare requests.post() would start a fresh, unauthenticated session
session.post(url_login, data=data)
# fetch the home page to check the logged-in state
url = 'https://github.com'
# url_2 = 'https://github.com/jhao104/proxy_pool'
response = session.get(url)
with open('Code/github.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)
爬取轻音乐歌曲
代码演示
"""Crawl the light-music genre listing and download each track as an MP3."""
import os
import re
import time

import requests

# BUG FIX: create the output directory up front — `open('songs/…', 'wb')` raises
# FileNotFoundError when the directory does not exist.
os.makedirs('songs', exist_ok=True)

# simple pagination over the genre listing
for page in range(1, 2):
    url = f'http://www.htqyy.com/genre/{page}'
    response = requests.get(url)
    # song ids are embedded in the checkbox `value` attributes
    songsid = re.findall('checked="checked" value="(.*?)">', response.text)
    for song in songsid:
        song_url = f"http://f2.htqyy.com/play7/{song}/mp3/12"
        # use a separate name — the original reused `response`, shadowing the page response
        song_response = requests.get(song_url)
        # throttle requests to be polite to the server
        time.sleep(1)
        with open(f"songs/{song}.mp3", 'wb') as f:
            f.write(song_response.content)
通过代码模拟登录豆瓣的行为
使用selenium动态获取网页的信息
代码演示
"""Drive a Firefox browser with Selenium to fill in and submit Douban's login form."""
from selenium import webdriver

login_page = "https://www.douban.com/"
# launch a Firefox browser instance and open the page
browser = webdriver.Firefox()
browser.get(login_page)
# locate the login form inputs via XPath
email_box = browser.find_element_by_xpath('//*[@id="form_email"]')
pwd_box = browser.find_element_by_xpath('//*[@id="form_password"]')
# type in the credentials
email_box.send_keys('')  # account
pwd_box.send_keys(' ')  # password
submit_btn = browser.find_element_by_xpath('/html/body/div[2]/div/div[1]/form/fieldset/div[3]/input')
# click the submit button
submit_btn.click()
爬取网易云音乐评论信息
代码演示
"""Scrape NetEase Cloud Music song comments across several pages with Selenium."""
from selenium import webdriver

url = 'https://music.163.com/#/song?id=115675'
# launch a Firefox browser and open the song page
driver = webdriver.Firefox()
driver.get(url)
# the comment list lives inside an iframe — switch into it
driver.switch_to.frame(0)
# IMPROVEMENT: open the output file once around the whole loop instead of
# re-opening it for every single comment; append mode keeps the output identical.
with open('wangyiyun.txt', 'a', encoding='utf-8') as f:
    for v in range(1, 4):
        # scroll to the bottom so the comments and the pager are rendered
        driver.execute_script('window.scrollBy(0,8000)')
        # grab every comment body on the current page
        contents = driver.find_elements_by_xpath('//div[@class="cnt f-brk"]')
        for content in contents:
            f.write(content.text + '\n')
            print(content.text)
        # BUG FIX: guard the pager lookup — indexing [0] on an empty result
        # raised IndexError when no "next page" link exists.
        next_buttons = driver.find_elements_by_link_text('下一页')
        if not next_buttons:
            break
        next_buttons[0].click()
# shut the browser down
driver.quit()