爬虫作业前六次

不想上班只想要钱

已于 2022-05-28 21:28:18 修改

阅读量418

点赞数 1

分类专栏： Python 文章标签：爬虫 python 开发语言

于 2022-05-20 14:14:02 首次发布

本文链接：https://blog.csdn.net/Aa12364567/article/details/124882429

版权

Python 专栏收录该内容

14 篇文章 8 订阅

订阅专栏

import requests
import json

number = int(input("请输入翻译方式（填写序号即可）：1.英译汉2.汉译英"))
text = input('请您输入要翻译的内容：')
url = 'http://fanyi.so.com/index/search'
headers = {
    'pro': 'fanyi',
    'user-agent': '自己的UA'
}
data = {
    'eng': number,
    'validate': '',
    'ignore_trans': '0',
    'query': text
}

res = requests.post(url, headers=headers, data=data)
res_json = json.loads(res.text)

print("您翻译过来的内容是：", res_json['data']['fanyi'])

import requests
import csv
from bs4 import BeautifulSoup
data_list=[]
url="https://sc.chinaz.com/tupian/"
headers={
    'User-Agent': '自己的UA'
}
response=requests.get(url,headers=headers)
html = response.content.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
div_tag=soup.find("div",id="container").select('a')[1::3]
print(div_tag)
for div in div_tag:
    item={}
    item["图片名称"]=div.get("alt")
    item["图片链接"]="https:"+div.get("href")
    data_list.append(item)



with open("图片.csv", 'w', encoding='utf-8', newline='') as f:
    wt = csv.DictWriter(f, ["图片名称", "图片链接"])
    wt.writeheader()
    wt.writerows(data_list)

import requests
import csv
from lxml import html

etree = html.etree
data_list = []
url = "https://www.9ku.com/music/t_new.htm"
headers = {
    'User-Agent': '自己的UA'
}
response = requests.get(url, headers=headers)
html = etree.HTML(response.content.decode('utf-8'))
# print(etree.tounicode(html))
li_tag_list = html.xpath('//div[@class="songList clearfix"]/ol')
for li_tag in li_tag_list:
    # print(etree.tounicode(li_tag))
    # print(li_tag.xpath('.//a/text()'))#歌名
    pre = "https://www.9ku.com"
    # print(li_tag.xpath('.//a/@href'))#链接
    listname = li_tag.xpath('.//a/text()')
    listurl = li_tag.xpath('.//a/@href')
    item = {}
    for (i, j) in zip(listname, listurl):
        item["歌名"] = i
        item["链接"] = pre + j
        # print(i,j)
        data_list.append(item)
    # break
    # item = {}
    # item["歌名"] = li_tag.xpath('.//a/text()')[0]
    # item["歌名"] =li_tag.xpath('.//a/@href')[0]
    # data_list.append(item)

with open('jiukumusic.csv', 'w', encoding='utf-8', newline='') as f:
    wt = csv.DictWriter(f, ["歌名", "链接"])
    wt.writeheader()
    wt.writerows(data_list)

import csv

import requests
import re
import json

data_list = []
url = "https://top.baidu.com/board?tab=movie"
headers = {
    'User-Agent': '自己的UA'
}
response = requests.get(url, headers=headers)
html = response.content.decode("utf-8")

result = re.findall(
    '<div class="c-single-text-ellipsis"> (.*?)</div>.*?<div class="intro_1l0wp"> 类型：(.*?) </div><div class="intro_1l0wp"> 演员：(.*?) </div>',
    html, re.S)
print(result)
for data in result:
    # 定义一个字典用来保存数据的
    item = {}
    # print(data)
    item["moviename"] = data[0]
    item["moviestyle"] = data[1]
    item["personname"] = data[2]
    # print(item)
    data_list.append(item)

print(data_list)

with open("movie.csv","w",encoding="utf-8",newline="") as f:
    wt = csv.DictWriter(f, ['moviename','moviestyle','personname'])
    # 写入表头
    wt.writeheader()
    wt.writerows(data_list)

import requests
import csv
from bs4 import BeautifulSoup
data_list=[]
url="https://sc.chinaz.com/yinxiao/"
headers={
    'User-Agent': '自己的UA'
}
response=requests.get(url,headers=headers)
html = response.content.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
div_tag=soup.find_all("div",class_="container")[3]
# print(div_tag)
for div in div_tag.find_all("div",class_="right-head"):
    div_list=div.find('a')
    item={}
    pre="https://sc.chinaz.com"
    item["音乐名称"]=div_list.find('p').string.strip()
    item["音乐链接"]=pre+div_list.get("href")
    # print(div_list.get("href"))
    # print(div_list.find('p').string.strip())
    # print("*"*50)
    data_list.append(item)

with open("音乐.csv", 'w', encoding='utf-8', newline='') as f:
    wt = csv.DictWriter(f, ["音乐名称", "音乐链接"])
    wt.writeheader()
    wt.writerows(data_list)

import csv

import requests
import re
import json

data_list = []
url = "https://music.163.com/discover/toplist"
headers = {
    'user-agent': '自己的UA'
}
response = requests.get(url, headers=headers)
html = response.content.decode("utf-8")
# print(html)
# result = re.findall('<a href="/song\?id=(.*?)">(.*?)</a>', html, re.S)
result = re.findall('<textarea id="song-list-pre-data" style="display:none;">(.*?)</textarea>', html, re.S)
# result=re.findall('"artists":\[\{"id":5773,"name":"谢霆锋","tns":\[],"alias":\[]}\],"alias":\[],"status":0,"ftype":0,"score":100.0,"copyrightId":2706568,"mvid":14490635,"transNames":null,"no":1,"publishTime":973008000000,"commentThreadId":"R_SO_4_168905","name":"活着Viva","id":168905,"type":0}',html,re.S)
res = json.loads(result[0])
# print(res[0]['artists'][0]['name']) #歌手
# print(res[0]['album']['name'])  #歌名
# for i in result:
#     print(i)
# print(res[10]['artists'][0]["name"]) #歌手
# a = ""
# item = res[10]['artists']
# print(res[10]['artists'])  # 歌名
# for i in res[10]['artists']:
#     a = a + i["name"] + "\\"
# print(a)
# print()
# print(res[1]['artists'])  # 歌名
for data in res:
    name = ""
    num = 0
    # print(data['artists'][0]['name'])
    item = {}
    for i in data['artists']:
        name += i["name"] + "\\"
        num += 1
    if (num == 1):
        name = data['artists'][0]['name']
    item["personname"] = name
    item["songname"] = data['name']
    data_list.append(item)

print(data_list)

with open("wyy.csv", 'w', encoding='utf-8', newline='') as f:
    wt = csv.DictWriter(f, fieldnames=['personname', "songname"])
    wt.writeheader()
    wt.writerows(data_list)

from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import time
import random

data_list = []
option = webdriver.ChromeOptions()
# 去掉识别
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_experimental_option("detach", True)
# 去掉window.navigator.webdriver的特性
option.add_argument("disable-blink-features=AutomationControlled")
# 设置无头
# option.add_argument('--headless')
driver = webdriver.Chrome(options=option)
driver.get('https://www.51job.com/')
driver.find_element(By.XPATH, '/html/body/div[3]/div/div[1]/div/a').click()
time.sleep(random.random() * 5)
driver.find_element(By.ID, "kwdselectid").send_keys("python")
time.sleep(random.random() * 5)
driver.find_element(By.ID, 'work_position_input').click()
time.sleep(random.random() * 5)
driver.find_element(By.ID, "work_position_click_center_right_list_category_000000_030200").click()
time.sleep(random.random() * 5)
driver.find_element(By.ID, 'work_position_click_bottom_save').click()
time.sleep(random.random() * 5)
driver.execute_script('arguments[0].click()', driver.find_element(By.ID, 'funtype_click'))
time.sleep(random.random() * 5)
driver.find_element(By.ID, 'funtype_click_center_right_list_category_0100_0100').click()
time.sleep(random.random() * 5)
driver.find_element(By.ID, 'funtype_click_center_right_list_sub_category_each_0100_0124').click()
time.sleep(random.random() * 5)
driver.find_element(By.ID, 'funtype_click_bottom_save').click()
time.sleep(random.random() * 5)
driver.find_element(By.XPATH, '//*[@id="workyear_list"]/span').click()
time.sleep(random.random() * 5)
driver.find_element(By.XPATH, '//*[@id="workyear_list"]/div/span[3]').click()
time.sleep(random.random() * 5)
driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[10]/span').click()
time.sleep(5)
while True:
    for span in driver.find_elements(By.XPATH, '/html/body/div[2]/div[3]/div/div[2]/div[4]/div[1]/div/a/p[1]/span[1]'):
        data_list.append(dict(name=span.text))
    try:
        driver.find_element(By.XPATH, '//li[@class="next"]/a').click()
        time.sleep(random.random() * 5)
    except:
        break



with open('zhaopin.csv', 'w', encoding='utf-8', newline='') as f:
    wt = csv.DictWriter(f, ['name'])
    wt.writeheader()
    wt.writerows(data_list)

'''
https://sc.chinaz.com/tupian/
https://sc.chinaz.com/tupian/index_2.html
https://sc.chinaz.com/tupian/index_3.html
'''

import requests
from lxml import html
import hashlib
import urllib.request

etree = html.etree
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}

from queue import Queue
import threading
import time


def producer(url_q, img_q, img_name):
    headers = {
        'User-Agent': '自己的UA'
    }
    while True:
        if url_q.empty():
            break
        response = requests.get(url_q.get(), headers=headers)
        html = etree.HTML(response.content)
        for div in html.xpath('//*[@id="container"]//div/div'):
            img_url = div.xpath('./a/img/@src2')[0]
            img_mingcheng = div.xpath('./a/img/@alt')[0]
            print(type(img_url))
            print("img_url： " + img_url)
            img_q.put(img_url)
            img_name.put(img_mingcheng)


def consumer(img_q, img_name):
    while True:
        if img_q.empty():
            break
        img_url ="https:" +img_q.get()
        img_mingcheng = img_name.get()
        name = 'image/' + img_mingcheng + '.' + img_url.rsplit('.', 1)[-1]
        print("name:" + name)
        urllib.request.urlretrieve(img_url, name)


if __name__ == '__main__':
    url_q = Queue()
    img_q = Queue()
    img_name = Queue()
    for i in range(5):
        url = f'https://sc.chinaz.com/tupian/index_{i + 1}.html'
        if i == 0:
            url = 'https://sc.chinaz.com/tupian/'
        url_q.put(url)
    # 生产者
    p_list = []
    for i in range(1):
        t = threading.Thread(target=producer, args=(url_q, img_q,img_name))
        t.start()
        p_list.append(t)
    for t in p_list:
        t.join()
    print('已经生产完成了')
    # 消费者
    c_list = []
    for i in range(1):
        t = threading.Thread(target=consumer, args=(img_q,img_name))
        t.start()
        c_list.append(t)
    for t in c_list:
        t.join()

不想上班只想要钱

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
爬虫作业前六次

import requestsimport jsonnumber = int(input("请输入翻译方式（填写序号即可）：1.英译汉2.汉译英"))text = input('请您输入要翻译的内容：')url = 'http://fanyi.so.com/index/search'headers = { 'pro': 'fanyi', 'user-agent': '自己的UA'}data = { 'eng': number, 'validate': '',
复制链接

扫一扫

专栏目录