基础爬虫
简单爬取网页信息,未进行过滤
代码演示
"""Basic crawler: POST to Lagou's job-listing AJAX endpoint and save the raw JSON response."""
import urllib.request
import urllib.parse

from fake_useragent import UserAgent

UA = UserAgent()  # NOTE(review): unused in this script — the User-Agent below is hard-coded

base_url = "https://www.lagou.com/jobs/positionAjax.json?"
# Chinese query-string values must be URL-encoded
se_url = urllib.parse.urlencode({
    "city": "成都",
    "needAddtionalResult": 'false'
})
url = base_url + se_url
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # BUG FIX: removed "'Accept-Encoding': 'gzip, deflate, br'". urllib does NOT
    # transparently decompress responses, so advertising compression support makes
    # the server send a gzip/brotli body and response.read().decode('utf-8') below
    # fails with a UnicodeDecodeError.
    'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
    'Connection': 'keep-alive',
    'Cookie': 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545208729; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545208853; _ga=GA1.2.673678807.1545208729; user_trace_token=20181219163959-ab4bffbc-0369-11e9-a1e6-525400f775ce; LGSID=20181219163959-ab4c0263-0369-11e9-a1e6-525400f775ce; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fbaidu%3Fwd%3Dlagou%26tn%3Dmonline_4_dg%26ie%3Dutf-8; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; LGRID=20181219164202-f4fc640f-0369-11e9-9212-5254005c3644; LGUID=20181219163959-ab4c04da-0369-11e9-a1e6-525400f775ce; _gid=GA1.2.389895687.1545208737; index_location_city=%E5%85%A8%E5%9B%BD'
}
# POST form fields
data = {
    'first': 'true',
    'pn': '2',
    'kd': 'python'
}
data = bytes(urllib.parse.urlencode(data), encoding='utf-8')
# build the request with custom headers; supplying `data` turns it into a POST
request = urllib.request.Request(url, data=data, headers=headers)
response = urllib.request.urlopen(request)
# persist the raw response body
with open('lagou.txt', 'w', encoding='utf-8') as f:
    f.write(response.read().decode('utf-8'))
简单的反爬(使用随机 User-Agent)
"""Fetch a page while disguising the client with a randomized User-Agent header."""
# import the required modules
import urllib.request
from fake_useragent import UserAgent

agent_pool = UserAgent()
target = "https://www.xicidaili.com"
# urlopen issues a GET request by default
request_headers = {
    "User-Agent": agent_pool.random,
}
# show one randomly generated user-agent string
print(agent_pool.random)
# build the request with the customized headers
req = urllib.request.Request(target, headers=request_headers)
resp = urllib.request.urlopen(req)
# dump the page source
print(resp.read().decode('utf-8'))
使用 requests,设置 IP 代理
代码演示
'''
IP proxy demo: route a request through a proxy server with requests.
'''
import requests

# target URL (shows the visitor's apparent IP)
url = 'https://www.ipip.net/'
# route traffic through a proxy via the `proxies` parameter
proxy = {
    'http': 'http://123.161.21.51:9797',
    'https': 'http://123.161.21.51:9797'
}
response = requests.get(url, proxies=proxy)
response.encoding = 'utf-8'
# BUG FIX: open the file with an explicit UTF-8 encoding — the response text is
# decoded as UTF-8 above, and the platform-default encoding (e.g. GBK on a
# Chinese Windows locale) would raise UnicodeEncodeError on write.
with open('Code/ip.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)
print(response.text)
模拟登录 GitHub
代码演示
'''
Log in to GitHub by replaying its login form inside one requests session.
'''
import requests
import re

# the login page embeds the CSRF token needed by the form POST
url_token = 'https://github.com/login'
# a Session carries cookies across requests (required for the login to stick)
session = requests.Session()
# fetch the login page and pull out the authenticity token
response_token = session.get(url_token)
response_token.encoding = 'utf-8'
# findall returns a list of captured groups
token = re.findall(
    'name="authenticity_token" value="(.*?)" />', response_token.text)
if not token:
    # fail loudly instead of posting a broken form
    raise RuntimeError('authenticity_token not found on the login page')
# form-submission endpoint
url_login = 'https://github.com/session'
# form payload
data = {
    'commit': 'Sign+in',
    'utf8': '✓',
    # BUG FIX: was `token[0:]`, which is the whole *list* — the server expects
    # the token string itself, i.e. the first captured match.
    'authenticity_token': token[0],
    'login': '',  # account name
    'password': '',  # password
}
# post through the SAME session so the cookies set above are reused;
# a bare requests.post() would start a fresh, unauthenticated session
session.post(url_login, data=data)
# fetch the home page to check the logged-in state
url = 'https://github.com'
# url_2 = 'https://github.com/jhao104/proxy_pool'
response = session.get(url)
with open('Code/github.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)
爬取轻音乐歌曲
代码演示
"""Crawl the light-music genre listing and download each track as an MP3."""
import os
import re
import time

import requests

# BUG FIX: create the output directory up front — `open('songs/…', 'wb')` raises
# FileNotFoundError when the directory does not exist.
os.makedirs('songs', exist_ok=True)

# simple pagination over the genre listing
for page in range(1, 2):
    url = f'http://www.htqyy.com/genre/{page}'
    response = requests.get(url)
    # song ids are embedded in the checkbox `value` attributes
    songsid = re.findall('checked="checked" value="(.*?)">', response.text)
    for song in songsid:
        song_url = f"http://f2.htqyy.com/play7/{song}/mp3/12"
        # use a separate name — the original reused `response`, shadowing the page response
        song_response = requests.get(song_url)
        # throttle requests to be polite to the server
        time.sleep(1)
        with open(f"songs/{song}.mp3", 'wb') as f:
            f.write(song_response.content)
通过代码模拟登录豆瓣的行为
使用selenium动态获取网页的信息
代码演示
"""Drive a Firefox browser with Selenium to fill in and submit Douban's login form."""
from selenium import webdriver

login_page = "https://www.douban.com/"
# launch a Firefox browser instance and open the page
browser = webdriver.Firefox()
browser.get(login_page)
# locate the login form inputs via XPath
email_box = browser.find_element_by_xpath('//*[@id="form_email"]')
pwd_box = browser.find_element_by_xpath('//*[@id="form_password"]')
# type in the credentials
email_box.send_keys('')  # account
pwd_box.send_keys(' ')  # password
submit_btn = browser.find_element_by_xpath('/html/body/div[2]/div/div[1]/form/fieldset/div[3]/input')
# click the submit button
submit_btn.click()
爬取网易云音乐评论信息
代码演示
"""Scrape NetEase Cloud Music song comments across several pages with Selenium."""
from selenium import webdriver

url = 'https://music.163.com/#/song?id=115675'
# launch a Firefox browser and open the song page
driver = webdriver.Firefox()
driver.get(url)
# the comment list lives inside an iframe — switch into it
driver.switch_to.frame(0)
# IMPROVEMENT: open the output file once around the whole loop instead of
# re-opening it for every single comment; append mode keeps the output identical.
with open('wangyiyun.txt', 'a', encoding='utf-8') as f:
    for v in range(1, 4):
        # scroll to the bottom so the comments and the pager are rendered
        driver.execute_script('window.scrollBy(0,8000)')
        # grab every comment body on the current page
        contents = driver.find_elements_by_xpath('//div[@class="cnt f-brk"]')
        for content in contents:
            f.write(content.text + '\n')
            print(content.text)
        # BUG FIX: guard the pager lookup — indexing [0] on an empty result
        # raised IndexError when no "next page" link exists.
        next_buttons = driver.find_elements_by_link_text('下一页')
        if not next_buttons:
            break
        next_buttons[0].click()
# shut the browser down
driver.quit()