python -- Crawler Code Collection

Crawler code collection -- a record of the scrapers I wrote when I was just starting out.

# -*- coding: gbk -*-
import calendar
import csv
import json
import os
import pprint
import random
import re
import time
from urllib.request import urlretrieve
from lxml import etree
import pandas as pd
import parsel
import pymysql
import requests
from selenium import webdriver

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
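
# The time and random imports above go unused in the functions below; they
# hint at request throttling, which these scrapers arguably need. A minimal
# sketch of a throttled fetch helper -- the helper name, delay bounds, and
# timeout are my own additions, not part of the original code:
def fetch(url, min_delay=0.5, max_delay=2.0):
    # Pause a random interval before each request so the target sites are
    # not hit with back-to-back requests.
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, headers=headers, timeout=10)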


# base_URL = 'https://www.umei.cc/bizhitupian/fengjingbizhi/'
def youmei():
    root = r'C:\Users\86136\Desktop\output result\图片'
    # Write the CSV header once; rows are appended as images are saved.
    with open(root + '\\' + '数据' + '.csv', 'w', newline='') as f:
        csvwriter = csv.writer(f, dialect='excel')
        csvwriter.writerow(["image name", "image URL"])
    for i in range(90):
        base_URL = 'https://www.umei.cc/bizhitupian/weimeibizhi/{}.htm'.format(i)
        response = requests.get(base_URL, headers=headers)
        response.encoding = response.apparent_encoding
        html = response.text
        parse = parsel.Selector(html)
        # Detail-page links for each wallpaper on the listing page.
        href = parse.xpath('//div[@class="TypeList"]/ul/li/a/@href').extract()
        for url in href:
            urls = requests.get(url, headers=headers).text
            imgs = parsel.Selector(urls)
            img = imgs.xpath('//div[@class="ImageBody"]/p/a/img/@src').extract_first()
            try:
                # The alt text arrives mis-decoded as Latin-1; re-encode and
                # decode as UTF-8 to recover a readable filename.
                filename = imgs.re(r'<img alt="(.*?)" ')[0].encode('ISO-8859-1').decode('utf-8')
                img_data = requests.get(img, headers=headers).content
                with open(root + '\\' + filename + '.jpg', 'wb') as img_file:
                    img_file.write(img_data)
                print(img, filename)
                # Log each saved image in the CSV.
                with open(root + '\\' + '数据' + '.csv', 'a', newline='') as f:
                    csvwriter = csv.writer(f, dialect='excel')
                    csvwriter.writerow([filename, img])
                    print('For problems, contact Tao Qing: 15549463230')
            except Exception:
                print("Invalid link, please check:", url)


def job():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    keyword = input('Enter the keyword to search for: ')
    temp = r'C:\Users\86136\Desktop\output result' + '\\' + keyword

    with open(temp + '.csv', 'a', newline='') as f:
        csvwriter = csv.writer(f, dialect='excel')
        csvwriter.writerow(["job title", "company name", "location", "salary", "post date", "job link", "company link"])

        for i in range(100):
            # The last placeholder is the page number (pages start at 1);
            # hard-coding 1 here would fetch the same page 100 times.
            url = 'https://search.51job.com/list/180200%252C040000,000000,0000,00,9,99,{0},2,{1}.html'.format(keyword, i + 1)
            response = requests.get(url, headers=headers)
            response.encoding = 'gbk'
            # print(response.text)
            html = etree.HTML(response.text)
            work_name = html.xpath('//div[@id="resultList"]/div[@class="el"]/p/span/a/@title')
            company_name = html.xpath('//div[@id="resultList"]/div[@class="el"]/span[@class="t2"]/a/@title')
            company_href = html.xpath('//div[@id="resultList"]/div[@class="el"]/span[@class="t2"]/a/@href')
            position = html.xpath('//div[@id="resultList"]/div[@class="el"]/span[@class="t3"]/text()')
            money = html.xpath('//div[@id="resultList"]/div[@class="el"]/span[@class="t4"]/text()')
            date = html.xpath('//div[@id="resultList"]/div[@class="el"]/span[@class="t5"]/text()')
            work_name_href = html.xpath('//div[@id="resultList"]/div[@class="el"]/p/span/a/@href')
            for a, b, c, d, e, ff, g in zip(work_name, company_name, position, money, date, work_name_href,
                                            company_href):
                print(a, b, c, d, e, ff, g)
                # The CSV is already open above; reuse its writer instead of
                # reopening the file for every row.
                csvwriter.writerow([a, b, c, d, e, ff, g])
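
# pandas is imported at the top but never used; a plausible follow-up is a
# quick sanity check of the CSV that job() writes. The path below is an
# example, and encoding='gbk' assumes the Chinese-locale Windows default
# used when the file was written without an explicit encoding:
def job_preview(csv_path=r'C:\Users\86136\Desktop\output result\python.csv'):
    df = pd.read_csv(csv_path, encoding='gbk')
    print(df.head())                   # first few scraped rows
    print(df.drop_duplicates().shape)  # row count after removing duplicates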

def win400():
    root = r'C:\Users\86136\Desktop\output result\图片爬取'
    # Create the output directory first, otherwise opening the CSV below fails.
    if not os.path.exists(root):
        os.mkdir(root)
    with open(root + '\\' + '数据' + '.csv', 'w', newline='') as f:
        csvwriter = csv.writer(f, dialect='excel')
        csvwriter.writerow(["image name", "image URL"])
    for i in range(5):
        base_URL = 'http://www.win4000.com/zt/xinggan_{}.html'.format(i)
        response = requests.get(base_URL, headers=headers)
        response.encoding = response.apparent_encoding
        html = response.text
        parse = parsel.Selector(html)
        # href = parse.xpath('//div[@class="tab_tj"]//ul[@class="clearfix"]/li/a/img/@data-original').extract()  # cover images only
        href = parse.xpath('//div[@class="tab_tj"]//ul[@class="clearfix"]/li/a/@href').extract()

        for url in href:
            try:
                urls = requests.get(url, headers=headers).text
                imgs = parsel.Selector(urls)
                img = imgs.xpath('//div[@class="pic-meinv"]/a/img/@src').extract_first()
                title = imgs.xpath('//div[@class="pic-meinv"]/a/img/@title').extract_first()
                filename = title
                img_data = requests.get(img, headers=headers).content
                with open(root + '\\' + filename + '.jpg', 'wb') as img_file:
                    img_file.write(img_data)
                print(img, filename)
                with open(root + '\\' + '数据' + '.csv', 'a', newline='') as f:
                    csvwriter = csv.writer(f, dialect='excel')
                    csvwriter.writerow([filename, img])
            except Exception:
                print("Invalid link, please check: {}".format(url))


def guoke():
    root = r'C:\Users\86136\Desktop\output result\果壳问答'
    if not os.path.exists(root):
        os.mkdir(root)
    with open(root + '\\' + '果壳问答' + '.csv', 'w', newline='') as f:
        csvwriter = csv.writer(f, dialect='excel')
        csvwriter.writerow(["question title", "question URL"])
    for i in range(1, 3):
        base_URL = 'https://www.guokr.com/ask/highlight/?page={}'.format(i)
        response = requests.get(base_URL, headers=headers)
        # response.encoding = response.apparent_encoding
        html = response.text
        # ******************* regex version ********************
        # pattern = re.compile('<h2><a target="_blank" href="(.*?)">(.*?)</a></h2>')
        # result = pattern.findall(html)
        # print(result)
        # ******************* xpath version ********************
        parse = parsel.Selector(html)
        # href = parse.xpath('//ul/li/div[2]/h2/a/@href').extract()
        # title = parse.xpath('//ul/li/div[2]/h2/a/text()').extract()
        data = parse.xpath('//ul/li/div[2]/h2/a').extract()
        for a in data:
            try:
                # Pull the href and anchor text out of the raw <a ...> markup.
                href = a.split('"')[3]
                title = a.split('<')[-2].split('>')[-1]
                print(title, href)
                with open(root + '\\' + '果壳问答' + '.csv', 'a', newline='') as f:
                    csvwriter = csv.writer(f, dialect='excel')
                    csvwriter.writerow([title, href])
            except Exception:
                print("Invalid entry, please check: {}".format(a))


def fiction_download():
    root = r'C:\Users\86136\Desktop\output result\小说下载'
    if not os.path.exists(root):
        os.mkdir(root)
    URL = 'http://www.shuquge.com/txt/73234/index.html'
    response = requests.get(URL, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    parse = parsel.Selector(html)
    href = parse.css('.listmain dd a::attr(href)').getall()
    title = parse.css('.listmain dd a::text').getall()
    filename = parse.css('.p a::text').getall()[1]
    # Skip the first 12 links: the duplicated "latest chapters" block that
    # sits above the full chapter list on the index page.
    for i in range(12, len(href)):
        url = URL.split('index')[0]
        base_URL = url + '{}'.format(href[i])
        response = requests.get(base_URL, headers=headers)
        response.encoding = response.apparent_encoding
        html = response.text
        # ******************* CSS selector ********************
        parse = parsel.Selector(html)
        # The chapter body presumably sits in div#content on shuquge pages;
        # adjust the selector if the site layout differs.
        content = parse.css('#content::text').getall()
        text = '\n'.join(line.strip() for line in content)
        # Append each chapter to one .txt file named after the novel.
        with open(root + '\\' + filename + '.txt', 'a', encoding='utf-8') as f:
            f.write(title[i] + '\n' + text + '\n\n')
        print('Downloaded:', title[i])
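
# None of the functions above are invoked anywhere, so the script does
# nothing when run as-is. A small entry point (the menu text and the
# choice-to-function mapping are my own):
if __name__ == '__main__':
    crawlers = {'1': youmei, '2': job, '3': win400, '4': guoke, '5': fiction_download}
    choice = input('Pick a crawler (1 umei, 2 51job, 3 win4000, 4 guokr, 5 novel): ')
    crawlers.get(choice, lambda: print('Unknown choice'))()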