Basic Crawlers

Simple scraping of page data, with no filtering applied.

Code demo:
import urllib.request
from fake_useragent import UserAgent
import urllib.parse

UA = UserAgent()
base_url = "https://www.lagou.com/jobs/positionAjax.json?"
# Chinese parameter values must be URL-encoded
se_url = urllib.parse.urlencode({
    "city": "成都",
    "needAddtionalResult": 'false'
})
url = base_url + se_url
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # urllib does not auto-decompress gzip/br, so ask for an uncompressed response
    'Accept-Encoding': 'identity',
    'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
    'Connection': 'keep-alive',
    'Cookie': 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545208729; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545208853; _ga=GA1.2.673678807.1545208729; user_trace_token=20181219163959-ab4bffbc-0369-11e9-a1e6-525400f775ce; LGSID=20181219163959-ab4c0263-0369-11e9-a1e6-525400f775ce; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fbaidu%3Fwd%3Dlagou%26tn%3Dmonline_4_dg%26ie%3Dutf-8; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; LGRID=20181219164202-f4fc640f-0369-11e9-9212-5254005c3644; LGUID=20181219163959-ab4c04da-0369-11e9-a1e6-525400f775ce; _gid=GA1.2.389895687.1545208737; index_location_city=%E5%85%A8%E5%9B%BD'

}
data = {
    'first': 'true',
    'pn': '2',
    'kd': 'python'
}
data = bytes(urllib.parse.urlencode(data), encoding='utf-8')
# Build the request with the custom headers
request = urllib.request.Request(url, data=data, headers=headers)
response = urllib.request.urlopen(request)
# Save the fetched page source to a file
with open('lagou.txt', 'w', encoding='utf-8') as f:
    f.write(response.read().decode('utf-8'))
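The endpoint returns JSON, so the saved text can be parsed rather than read by eye. A minimal sketch, assuming the response keeps the content → positionResult → result layout (those key names, and the fields printed below, are assumptions about this endpoint rather than a documented contract):

# A minimal sketch of parsing the saved response; the key and field names
# are assumptions about this endpoint and may change
import json

with open('lagou.txt', 'r', encoding='utf-8') as f:
    payload = json.load(f)

for position in payload.get('content', {}).get('positionResult', {}).get('result', []):
    # companyFullName / positionName / salary are fields commonly seen here
    print(position.get('companyFullName'),
          position.get('positionName'),
          position.get('salary'))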
Beating simple anti-scraping checks
# Import modules
import urllib.request
from fake_useragent import UserAgent

UA = UserAgent()
url = "https://www.xicidaili.com"
# Generate one random User-Agent and keep it, so the value printed below is
# the same one actually sent (each access to UA.random yields a new string)
random_ua = UA.random
print(random_ua)
headers = {
    "User-Agent": random_ua
}
# Build the request with the spoofed header
request = urllib.request.Request(url, headers=headers)
# urlopen defaults to a GET request
response = urllib.request.urlopen(request)
# Print the fetched page source
print(response.read().decode('utf-8'))
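Since each access to UA.random yields a fresh string, the same idea extends to rotating the User-Agent across several attempts. A small sketch, assuming the same target URL, with error handling so one blocked request does not kill the loop:

import urllib.error
import urllib.request
from fake_useragent import UserAgent

UA = UserAgent()
url = "https://www.xicidaili.com"

for attempt in range(3):
    # A fresh random User-Agent on every attempt
    request = urllib.request.Request(url, headers={"User-Agent": UA.random})
    try:
        response = urllib.request.urlopen(request, timeout=10)
        print(attempt, response.getcode())
        break
    except (urllib.error.HTTPError, urllib.error.URLError) as exc:
        print(attempt, 'request failed:', exc)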
Using requests with an IP proxy
Code demo:
'''
IP proxy
'''
import requests

# url
url = 'https://www.ipip.net/'

# Route the request through another IP via the proxies parameter
proxy = {
    'http': 'http://123.161.21.51:9797',
    'https': 'http://123.161.21.51:9797'
}

response = requests.get(url, proxies=proxy)

response.encoding = 'utf-8'

with open('Code/ip.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)

print(response.text)
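A hardcoded free proxy like the one above goes stale quickly. A defensive sketch with a timeout and a fallback to a direct request when the proxy refuses the connection (the proxy address is only a sample):

import requests

url = 'https://www.ipip.net/'
proxy = {
    'http': 'http://123.161.21.51:9797',
    'https': 'http://123.161.21.51:9797'
}

try:
    response = requests.get(url, proxies=proxy, timeout=10)
except (requests.exceptions.ProxyError, requests.exceptions.Timeout):
    # Proxy is dead or too slow: fall back to a direct request
    response = requests.get(url, timeout=10)

print(response.status_code)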
Simulating a GitHub login
Code demo:
'''
Log in to GitHub
'''
import requests
import re

# Login page URL
url_token = 'https://github.com/login'

# Instantiate a Session so cookies persist across requests
session = requests.Session()

# Fetch the login page to obtain the CSRF token
response_token = session.get(url_token)
response_token.encoding = 'utf-8'
# findall returns a list of matches
token = re.findall(
    'name="authenticity_token" value="(.*?)" />', response_token.text)

# Simulate the login

# Form submission URL
url_login = 'https://github.com/session'

# Fill in the form as a dict (requests URL-encodes the values itself,
# so the commit value keeps its literal space)
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token[0],  # first regex match, not the whole list
    'login': '',     # account name
    'password': '',  # password
}

# Send the request through the same Session; a bare requests.post() would
# start a fresh session and drop the cookies obtained above
#requests.post()
session.post(url_login, data=data)

# Fetch the home page to verify the login
url = 'https://github.com'

# url_2 = 'https://github.com/jhao104/proxy_pool'

# Send the request
response = session.get(url)

with open('Code/github.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)
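The script saves the page but never checks whether the login actually worked. A quick sanity check, reusing the session from above and assuming the logged-in home page contains a "Sign out" link (that marker string is an assumption about GitHub's HTML, not a guaranteed contract):

import requests

def login_succeeded(session: requests.Session) -> bool:
    # 'Sign out' only shows up for authenticated users; adjust the marker
    # if GitHub's markup differs
    page = session.get('https://github.com').text
    return 'Sign out' in page

print(login_succeeded(session))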
Scraping light-music tracks
Code demo:
import requests
import re
import time

# Simple pagination (widen the range to fetch more pages)
for page in range(1, 2):
    url = f'http://www.htqyy.com/genre/{page}'
    response = requests.get(url)
    # print(response.text)
    songsid = re.findall('checked="checked" value="(.*?)">', response.text)
    # print(songsid)
    for song in songsid:
        song_url = f"http://f2.htqyy.com/play7/{song}/mp3/12"
        response = requests.get(song_url)
        # print(response)
        time.sleep(1)
        with open(f"songs/{song}.mp3", 'wb') as f:
            f.write(response.content)
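The files above are named only by their numeric id. A variant sketch that pairs each id with a nearby title attribute so the files get readable names; the title="..." regex is an assumption about the page markup and may need adjusting against the real HTML:

import re
import time
import requests

url = 'http://www.htqyy.com/genre/1'
response = requests.get(url)
# Pair each checkbox value (song id) with a nearby title attribute;
# this pattern is an assumption about the page markup
pairs = re.findall('checked="checked" value="(.*?)".*?title="(.*?)"',
                   response.text, re.S)
for song_id, title in pairs:
    mp3 = requests.get(f"http://f2.htqyy.com/play7/{song_id}/mp3/12")
    time.sleep(1)  # be polite between downloads
    with open(f"songs/{title}.mp3", 'wb') as f:
        f.write(mp3.content)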

Simulating the Douban login flow in code

Use Selenium to pull information from a dynamically rendered page.

Code demo:
from selenium import webdriver

url = "https://www.douban.com/"

# Start a browser instance
driver = webdriver.Firefox()
driver.get(url)

# Locate the input boxes with XPath
user = driver.find_element_by_xpath('//*[@id="form_email"]')
password = driver.find_element_by_xpath('//*[@id="form_password"]')

# Type in the credentials
user.send_keys('')      # account name
password.send_keys('')  # password

log = driver.find_element_by_xpath('/html/body/div[2]/div/div[1]/form/fieldset/div[3]/input')
# Click the login button
log.click()
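On a dynamically rendered page the inputs may not exist yet when the script looks for them, so an immediate lookup can raise NoSuchElementException. A sketch using an explicit wait instead (note that Selenium 4 also replaces find_element_by_xpath with find_element):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("https://www.douban.com/")

# Block for up to 10 seconds until the email box is actually in the DOM
user = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="form_email"]'))
)
user.send_keys('')  # account name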
Scraping NetEase Cloud Music comments
Code demo:
from selenium import webdriver

url = 'https://music.163.com/#/song?id=115675'
# Start a browser instance
driver = webdriver.Firefox()
driver.get(url)
# The comments live inside an iframe, so switch into it first
driver.switch_to.frame(0)
for v in range(1, 4):
    js = 'window.scrollBy(0,8000)'
    # Run the JS to scroll the page to the bottom
    driver.execute_script(js)
    # Grab the comment display boxes with XPath
    contents = driver.find_elements_by_xpath('//div[@class="cnt f-brk"]')
    for content in contents:
        with open('wangyiyun.txt', 'a', encoding='utf-8') as f:
            f.write(content.text+'\n')
        print(content.text)
    # Locate the "next page" button by its link text (下一页)
    btn = driver.find_elements_by_link_text('下一页')[0]
    btn.click()
# Quit the browser
driver.quit()
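Clicking 下一页 does not guarantee the fresh comments are rendered before the next loop iteration scrapes them, so the same page can be read twice. A fragment that could replace the bare btn.click() in the loop above (it reuses driver and btn from that block), waiting for the old first comment to detach from the DOM:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Remember the first comment on the current page...
first_comment = driver.find_elements_by_xpath('//div[@class="cnt f-brk"]')[0]
btn.click()
# ...then wait up to 10 seconds for it to go stale, which signals that the
# next page of comments has replaced it
WebDriverWait(driver, 10).until(EC.staleness_of(first_comment))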