1. Example code:
#!/usr/bin/env python
#-*- coding: utf-8 -*-
#@Time : 2020/4/7 16:28
#@Author : zhangliangliang
#@File : crawlerDemo3.py
#@Software: PyCharm
from urllib import request
from lxml import etree
import random
import requests,threading,datetime
from bs4 import BeautifulSoup
BASE_URL = "http://www.dytt8.net"
def readFile(path):
    """Read the text file at *path* and return its lines as a list.

    Trailing whitespace (including the newline) is stripped from each line.

    :param path: path to a readable text file
    :return: list[str] of stripped lines, in file order (empty list for an empty file)
    :raises OSError: if the file cannot be opened
    """
    # Encoding is pinned to utf-8 for consistency with truncateFile and to
    # avoid depending on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.rstrip() for line in f]
def writeFile(path, text):
    """Append *text* followed by a newline to the file at *path*.

    The file is created if it does not exist; existing content is preserved
    (mode 'a').

    :param path: path of the file to append to
    :param text: line of text to append (newline is added here)
    :raises OSError: if the file cannot be opened for writing
    """
    # Encoding pinned to utf-8 for consistency with truncateFile/readFile.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')
def truncateFile(path):
    """Empty the file at *path*, creating it if it does not exist.

    :param path: path of the file to empty
    :raises OSError: if the file cannot be opened for writing
    """
    # Opening in 'w' mode already truncates the file; the explicit
    # truncate() call is kept from the original but is a no-op here.
    with open(path, 'w', encoding='utf-8') as f:
        f.truncate()
def getHeaders(path="/Users/zll/pycharmProjects/studyPython/crawler_poject_base_part1/config/user_agent.txt"):
    """Build an HTTP request-header dict with a randomly chosen User-Agent.

    The pool of User-Agent strings is read (one per line) from *path*.
    Generalized: the previously hard-coded config path is now a defaulted
    parameter, so existing callers are unaffected.

    :param path: text file containing one User-Agent string per line
    :return: dict with a single 'User-Agent' key
    :raises OSError: if the user-agent file cannot be read
    :raises IndexError: if the file is empty (random.choice on empty list)
    """
    user_agent_list = readFile(path)
    user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': user_agent}
    print(headers)  # debug trace kept from the original
    return headers
def getIp(path='/Users/zll/pycharmProjects/studyPython/crawler_poject_base_part1/config/ip.txt'):
    """Pick one proxy IP ("host:port" string) at random from a config file.

    Generalized: the previously hard-coded config path is now a defaulted
    parameter, so existing callers are unaffected.

    :param path: text file containing one proxy address per line
    :return: one randomly selected line from the file
    :raises OSError: if the ip file cannot be read
    :raises IndexError: if the file is empty (random.choice on empty list)
    """
    ip_list = readFile(path)
    ip = random.choice(ip_list)
    print(ip)  # debug trace kept from the original
    return ip
def checkip(targeturl, ip):
    """Return True if *targeturl* answers HTTP 200 through proxy *ip*.

    :param targeturl: URL used to probe the proxy
    :param ip: proxy address as "host:port" (scheme prefixes are added here)
    :return: True on status 200 within a 5 s timeout, False otherwise
    """
    headers = getHeaders()  # randomized request headers
    proxies = {"http": "http://" + ip, "https": "https://" + ip}
    try:
        status = requests.get(url=targeturl, proxies=proxies,
                              headers=headers, timeout=5).status_code
    except requests.RequestException:
        # Narrowed from a bare `except:`: only network/timeout/proxy errors
        # mean "bad proxy"; programming errors now surface instead of being
        # silently swallowed (a bare except even caught KeyboardInterrupt).
        return False
    return status == 200
def getProxies(url):
    """Return a requests-style proxies dict for a random working proxy.

    Picks a random IP via getIp() and validates it against *url* with
    checkip(). Note the returned mapping only covers the 'http' scheme,
    matching the original behavior.

    :param url: URL used to validate the candidate proxy
    :return: {'http': 'http://<ip>'} on success, or None when the candidate
             proxy fails validation (explicit None replaces the bare
             `return`; requests treats proxies=None as "no proxy")
    """
    ip = getIp()
    if not checkip(url, ip):  # idiomatic truth test instead of `is True`
        return None
    proxies = {'http': 'http://' + ip}
    print(proxies)  # debug trace kept from the original
    return proxies
#def getProxies(url):
def get_detail_url(url):
proxies = getProxies(url)
header = getHeaders()
try:
response = requests.get(url, headers=header,proxies=proxies)
#print(response.content.decode('gbk'))