文章目录
1.基础爬虫
1.1.请求与返回
req = urllib.request.Request('网页地址')
response = urllib.request.urlopen(req)
1.2.response对象的方法
(1) read() 读取爬取的所有内容
(2) geturl()得到访问的网址
(3) info()得到HTTPMessage对象
(4) getcode()得到当前状态(200是正常接收)
1.3.获取翻译的python代码示例
1.4.获取图片实例
1.5.IP代理
1.6.url详解
1.7.请求头常见参数
1.8.常见响应状态码
1.9.常见相关函数
1.10.cookie
2.更简单的request库的使用
3.csv文件
3.python连接mysql数据库
4.python与mongoDB
5.python多线程爬虫
6.动态网页数据抓取:Selenium + chromedriver获取动态数据
6.1.基本函数
XPath语法:
6.2.操作表单元素
6.3.Cookie操作
6.4.页面等待
6.5.切换页面
6.6.设置代理IP
7.图形验证码识别技术
8.爬电影信息实例
import csv
import os
import threading
import time
from queue import Empty, Queue

import pymysql
import requests
from lxml import etree
# 请求头部
header = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Mobile Safari/537.36'}
# 排行版url
url = 'https://movie.douban.com/chart'
# 数据库信息
dbserverID = "localhost"
user = "root"
passord = "19990729"
dbname = "movies"
# 队列的定义
q =Queue()
# 定义图片存储路径
os.mkdir("movies_img")
path = os.getcwd() + os.sep + "movies_img"
def spider():
res = requests.get(url,headers = header)
# 解析网页
selector = etree.HTML(res.text)
movies = selector.xpath('//*[@id="content"]/div/div[1]/div/div/table')
items = [] #存放所有的item信息
for movie in movies:
# 图片url
img_url = movie.xpath('tr/td[1]/a/img/@src')[0]
# 电影名爬取与处理
movie_name = movie.xpath('tr/td[2]/div/a/text()')[0]
movie_name = movie_name.strip().strip('/').strip()
# 电影评分
movie_score = movie.xpath('tr/td[2]/div/div/span[2]/text()')[0]
item = (movie_name,movie_score,img_url)
items.append(item)
return items
def save_csv(items):
with open(os.getcwd()+os.sep+'movie.csv','a',encoding="utf-8-sig",newline='') as csvfile:
write = csv.writer((csvfile))
write.writerow(['电影名','分数','图片url'])
write.writerows(items)
def save_sql(items):
try:
con = pymysql.connect(dbserverID, user, passord, dbname) # 数据库连接
cursor = con.cursor()
sql = 'insert into movie_data(电影名,分数,图片url) values(%s,%s,%s)'
cursor.executemany(sql,items)
con.commit()
except:
con.rollback()
finally:
con.close()
print("关闭数据连接")
class download_img(threading.Thread):
def __init__(self,ID):
threading.Thread.__init__(self)
self.ID = ID
def run(self):
while not q.empty():
print("线程{0}正在下载图片...".format(self.ID))
url = q.get()
# 得到图片名
name = url.split('/')[-1]
with open(os.path.join(path,name),'wb') as f:
f.write(requests.get(url).content)
time.sleep(2)
if __name__ == "__main__":
items = spider()
#save_csv(items)
#save_sql(items)
# 填充队列
for each in items:
q.put(each[2])
d1 = download_img(1)
d2 = download_img(2)
d1.start()
d2.start()
d1.join()
d2.join()