Python爬虫常用功能

文件头

import os
from lxml import etree
import requests
import pymysql
import json
import random
# Script entry-point guard. No body is shown in this snippet; presumably the
# steps listed below are meant to be placed under it -- TODO confirm.
if __name__ == '__main__':

创建文件夹

# Directory where downloaded images are stored.
# Raw string: in a normal literal "\i" is an invalid escape sequence, which
# newer Python versions warn about.
path = r'G:\img'
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

UA伪装

# Request headers that present the client as a desktop Chrome browser
# instead of the default python-requests user agent.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) "
        "AppleWebKit/535.11 (KHTML, like Gecko) "
        "Chrome/17.0.963.56 Safari/535.11"
    ),
}

代理IP

# Pool of candidate proxies in the mapping format requests expects
# (URL scheme -> proxy address). Only "http" is mapped here, so requests to
# https:// URLs are not routed through the proxy.
proxy_list = [
    {"http": "171.35.170.254:9999"}
]
# Pick one proxy at random for this request.
proxy = random.choice(proxy_list)
# requests has no default timeout; without one a dead proxy would block the
# script forever, so give up after 10 seconds.
page_text = requests.get(url=url, headers=headers, proxies=proxy, timeout=10)

请求返回json

# Fetch the URL, echo the Response object (shows the HTTP status) and decode
# the body as JSON into page_text.
response = requests.get(url=url, headers=headers, proxies=proxy)
print(response)
page_text = response.json()

xpath解析

# XPath selecting the text of the link(s) inside the second nested span of
# the first span under the element with id="info".
DAO_YAN_XPATH = '//*[@id="info"]/span[1]/span[2]/a/text()'

# Parse the detail-page HTML into an element tree and run the query;
# xpath() returns a list of matching text nodes (empty when nothing matches).
tree = etree.HTML(detail_page_text)
dao_yan = tree.xpath(DAO_YAN_XPATH)

下载图片到本地

# Download the image referenced under the element with id="mainpic" and save
# it inside the local image directory `path`.
# [0] takes the first match; an IndexError here means the page has no such image.
img_url = tree.xpath('//*[@id="mainpic"]/a/img/@src')[0]
# The local file keeps the last URL segment as its name. os.path.join builds
# the path with the platform's separator instead of a hard-coded backslash.
local_img_url = os.path.join(path, img_url.split("/")[-1])
# timeout: requests never times out by default.
img_response = requests.get(url=img_url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx rather than saving an error page as the image.
img_response.raise_for_status()
img_code = img_response.content
with open(local_img_url, 'wb') as fp:
    fp.write(img_code)

插入数据库前组装数据必须为元组

# Rows to insert; each row must be a tuple whose order matches the column
# list of the INSERT statement.
data = []
# Despite its name, `dic` is a tuple (one database row), not a dict.
dic = (name, score, dao_yan, bian_ju, zhu_yan, lei_xing, hao_yu_1, hao_yu_2, local_img_url, jian_jie)
data.append(dic)

批量插入数据库

def insert(value):
    """Bulk-insert rows into the ``t_douban`` table.

    Args:
        value: Sequence of 10-item tuples, one per row, ordered as
            (name, score, dao_yan, bian_ju, zhu_yan, lei_xing, hao_yu_1,
            hao_yu_2, img, jian_jie).

    Returns:
        None. On success the transaction is committed and a message is
        printed. On any error the transaction is rolled back and the error
        is printed; it is not re-raised.
    """
    # NOTE(review): connection credentials are hard-coded; consider moving
    # them to configuration.
    db = pymysql.connect(host="localhost", user="root", password="123456", database="TESTDB")
    # Every placeholder is %s: that is the positional placeholder PyMySQL
    # documents; the driver escapes and quotes each value itself, so %r
    # (Python repr formatting) does not belong in the statement.
    sql = (
        "INSERT INTO t_douban(name,score, dao_yan,bian_ju, zhu_yan, lei_xing, hao_yu_1, hao_yu_2, img, jian_jie) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    try:
        # The context manager closes the cursor when the block exits.
        with db.cursor() as cursor:
            cursor.executemany(sql, value)
        db.commit()
        print('插入数据成功')
    except Exception as ex:
        # Best-effort: undo the partial batch and report, but do not raise.
        db.rollback()
        print("出现如下异常%s:" % ex)
    finally:
        # Always release the connection, even if rollback itself fails.
        db.close()

下载json到本地

# Fetch the URL, echo the Response object (shows the HTTP status) and decode
# the body as JSON.
page_text = requests.get(url=url, headers=headers, proxies=proxy)
print(page_text)
page_text = page_text.json()

# Write the decoded JSON to a local file. The with-block guarantees the file
# is flushed and closed; ensure_ascii=False keeps non-ASCII text readable
# instead of \uXXXX escapes.
with open('bzhan.json', 'w', encoding='utf-8') as fp:
    json.dump(page_text, fp, ensure_ascii=False)

下载html到本地

# Fetch the page and save its raw body to a local HTML file.
# `headers` is the request-header dict used by the other snippets; the
# original `header` was an undefined name.
response = requests.get(url=url, headers=headers, proxies=proxy)
# Only affects how response.text is decoded; response.content below is the
# raw bytes and is written unchanged.
response.encoding = 'utf-8'
with open('daili.html', 'wb') as fp:
    fp.write(response.content)
  • 6
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 5
    评论
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值