# 爬虫代码汇总 – 记录初入行时写过的爬虫代码 (collection of early web-scraper scripts)
# -*- coding: gbk -*-
import calendar
import csv
import json
import os
import pprint
import random
import re
import time
from urllib.request import urlretrieve
from lxml import etree
import pandas as pd
import parsel
import pymysql
import requests
from selenium import webdriver
# Browser-style User-Agent header passed to every requests.get() call below.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
# base_URL='https://www.umei.cc/bizhitupian/fengjingbizhi/'
def youmei():
    """Scrape wallpaper images from umei.cc list pages 0..89.

    For each gallery detail page, downloads the lead image to
    ``root\\<title>.jpg`` and appends a (title, image-url) row to
    ``数据.csv``. Side effects only: network requests, file writes,
    console output. Invalid detail pages are skipped with a warning.
    """
    root = r'C:\Users\86136\Desktop\output result\图片'
    # Ensure the output directory exists before any file under it is opened
    # (consistent with the guard the other scrapers in this file perform).
    os.makedirs(root, exist_ok=True)
    csv_path = os.path.join(root, '数据.csv')
    with open(csv_path, 'w', newline='') as f:
        csv.writer(f, dialect='excel').writerow(["图片名称", "图片链接"])
    for page in range(90):
        list_url = 'https://www.umei.cc/bizhitupian/weimeibizhi/{}.htm'.format(page)
        response = requests.get(list_url, headers=headers)
        response.encoding = response.apparent_encoding
        parse = parsel.Selector(response.text)
        detail_urls = parse.xpath('//div[@class="TypeList"]/ul/li/a/@href').extract()
        for url in detail_urls:
            detail = parsel.Selector(requests.get(url, headers=headers).text)
            img = detail.xpath('//div[@class="ImageBody"]/p/a/img/@src').extract_first()
            try:
                # The alt text arrives mislabeled; re-encode as latin-1 bytes
                # and decode as UTF-8 to recover the real title.
                filename = detail.re(r'<img alt="(.*?)" ')[0].encode('ISO-8859-1').decode('utf-8')
                img_data = requests.get(img, headers=headers).content
                with open(os.path.join(root, filename + '.jpg'), 'wb') as f:
                    f.write(img_data)
                print(img, filename)
                with open(csv_path, 'a', newline='') as f:
                    csv.writer(f, dialect='excel').writerow([filename, img])
                print('如有问题,请联系陶青,15549463230')
            except Exception:
                # Include the failing detail-page url (the old message did
                # not, unlike win400's), so bad links can be inspected.
                print("该链接无效,请检查{}".format(url))
def job():
    """Scrape 51job search listings for a user-supplied keyword.

    Prompts for the keyword, then walks result pages 1..100 and appends
    one row per posting (name, company, location, salary, date, links)
    to ``<keyword>.csv``. Side effects only: console I/O, network
    requests, CSV writes.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    keyword = input('请输入要爬取的关键字:')
    temp = r'C:\Users\86136\Desktop\output result' + '\\' + keyword
    with open(temp + '.csv', 'a', newline='') as f:
        csvwriter = csv.writer(f, dialect='excel')
        csvwriter.writerow(["工作名字", "公司名字", "公司地点", "薪资待遇", "发布日期", "职位简介", "公司简介"])
    for i in range(100):
        # BUG FIX: the page slot used to be hard-coded to 1
        # (.format(keyword, 1)), so the same first page was fetched 100
        # times. Use i + 1 to actually paginate through pages 1..100.
        url = 'https://search.51job.com/list/180200%252C040000,000000,0000,00,9,99,{0},2,{1}.html'.format(keyword,
                                                                                                         i + 1)
        response = requests.get(url, headers=headers)
        # The site serves GBK-encoded pages.
        response.encoding = 'gbk'
        html = etree.HTML(response.text)
        work_name = html.xpath('//div[@id="resultList"]/div[@class="el"]/p/span/a/@title')
        company_name = html.xpath('//div[@id="resultList"]/div[@class="el"]/span[@class="t2"] /a/@title')
        company_href = html.xpath('//div[@id="resultList"]/div[@class="el"]/span[@class="t2"] /a/@href')
        position = html.xpath('//div[@id="resultList"]/div[@class="el"]/span[@class="t3"]/text()')
        money = html.xpath('//div[@id="resultList"]/div[@class="el"]/span[@class="t4"]/text()')
        date = html.xpath('//div[@id="resultList"]/div[@class="el"]/span[@class="t5"] /text()')
        work_name_href = html.xpath('//div[@id="resultList"]/div[@class="el"]/p/span/a/@href')
        for a, b, c, d, e, ff, g in zip(work_name, company_name, position, money, date, work_name_href,
                                        company_href):
            print(a, b, c, d, e, ff, g)
            with open(temp + '.csv', 'a', newline='') as f:
                csvwriter = csv.writer(f, dialect='excel')
                csvwriter.writerow([a, b, c, d, e, ff, g])
def win400():
    """Scrape gallery lead images from win4000.com topic pages 0..4.

    Downloads each gallery's lead image to ``root\\<title>.jpg`` and
    appends (title, image-url) rows to ``数据.csv``. Side effects only:
    network requests, file writes, console output.
    """
    root = r'C:\Users\86136\Desktop\output result\图片爬取'
    # BUG FIX: the existence check used to sit inside the download loop,
    # AFTER '数据.csv' had already been opened under root — that first
    # open failed whenever the directory was missing. Create it up front.
    os.makedirs(root, exist_ok=True)
    csv_path = os.path.join(root, '数据.csv')
    with open(csv_path, 'w', newline='') as f:
        csv.writer(f, dialect='excel').writerow(["图片名称", "图片链接"])
    for page in range(5):
        list_url = 'http://www.win4000.com/zt/xinggan_{}.html'.format(page)
        response = requests.get(list_url, headers=headers)
        response.encoding = response.apparent_encoding
        parse = parsel.Selector(response.text)
        # Gallery detail-page links (a commented-out variant once grabbed
        # the cover thumbnails via img/@data-original instead).
        hrefs = parse.xpath('//div[@class="tab_tj"]//ul[@class="clearfix"]/li/a/@href').extract()
        for url in hrefs:
            try:
                detail = parsel.Selector(requests.get(url, headers=headers).text)
                img = detail.xpath('//div[@class="pic-meinv"]/a/img/@src').extract_first()
                title = detail.xpath('//div[@class="pic-meinv"]/a/img/@title').extract_first()
                img_data = requests.get(img, headers=headers).content
                with open(os.path.join(root, title + '.jpg'), 'wb') as f:
                    f.write(img_data)
                print(img, title)
                with open(csv_path, 'a', newline='') as f:
                    csv.writer(f, dialect='excel').writerow([title, img])
            except Exception:
                print("该链接无效,请检查{}".format(url))
def guoke():
    """Scrape highlighted Q&A titles and links from guokr.com pages 1..2.

    Writes (title, href) rows to ``果壳问答.csv`` under ``root``. Side
    effects only: network requests, CSV writes, console output.
    """
    root = r'C:\Users\86136\Desktop\output result\果壳问答'
    # makedirs(exist_ok=True) replaces the old exists()/mkdir() pair and
    # also creates missing parent directories.
    os.makedirs(root, exist_ok=True)
    csv_path = os.path.join(root, '果壳问答.csv')
    with open(csv_path, 'w', newline='') as f:
        csv.writer(f, dialect='excel').writerow(["问题标题", "问题链接"])
    for page in range(1, 3):
        list_url = 'https://www.guokr.com/ask/highlight/?page={}'.format(page)
        html = requests.get(list_url, headers=headers).text
        parse = parsel.Selector(html)
        # Raw '<a target="_blank" href="...">title</a>' fragments; href and
        # title are pulled apart by string splitting below.
        anchors = parse.xpath('//ul/li/div[2]/h2/a').extract()
        for anchor in anchors:
            try:
                # Quote-split index 3 is the href attribute value.
                href = anchor.split('"')[3]
                # The text between the last '>' and the closing '</a>'.
                title = anchor.split('<')[-2].split('>')[-1]
                print(title, href)
                with open(csv_path, 'a', newline='') as f:
                    csv.writer(f, dialect='excel').writerow([title, href])
            except Exception:
                # BUG FIX: the old message formatted ``href``, which is
                # unbound when the first split fails, so the handler itself
                # raised NameError. Report the raw anchor fragment instead.
                print("该数据无效,请检查{}".format(anchor))
def fiction_download():
root = r'C:\Users\86136\Desktop\output result\小说下载'
y = os.path.exists(root)
if y == 0:
os.mkdir(root)
else:
pass
URL = 'http://www.shuquge.com/txt/73234/index.html'
response = requests.get(URL, headers=headers)
response.encoding = response.apparent_encoding
html = response.text
# print(html)
parse = parsel.Selector(html)
# print(parse)
href = parse.css('.listmain dd a::attr(href)').getall()
title = parse.css('.listmain dd a::text').getall()
filename = parse.css('.p a::text').getall()[1]
# print(filename,title,href)
for i in range(12, len(href)):
url = URL.split('index')[0]
base_URL = url + '{}'.format(href[i])
response = requests.get(base_URL, headers=headers)
response.encoding = response.apparent_encoding
html = response.text
# print(html)
# *******************css选择器********************
parse = parsel.Selector(html)
# print(