Automatically scrape data from a website, process it, and write it to a database
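The script below drives headless Edge through Selenium to walk the site's per-city listing pages, decodes the digit-obfuscation woff font with fontTools so that dates, mileage and prices read correctly, and bulk-inserts the parsed rows into MySQL through pymysql. The leading docstring keeps the DDL for the raw table (ppCar) and the SQL used afterwards to de-duplicate it into secondCar.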
import os
import time
import re
import fake_useragent
import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge import options
from fontTools.ttLib import TTFont
from datetime import datetime
'''
SQL to create the raw table:
CREATE TABLE `ppCar` (
`id` int(11) NOT NULL AUTO_INCREMENT,
car_id VARCHAR(255),
`car_info` varchar(255) DEFAULT NULL,
`car_date` date DEFAULT NULL,
`car_mile` float DEFAULT NULL,
`car_total_price` float DEFAULT NULL,
`car_payment` float DEFAULT NULL,
`city` varchar(255) DEFAULT NULL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
SQL to de-duplicate the data into secondCar:
CREATE TABLE `secondCar` (
`id` int(11) NOT NULL AUTO_INCREMENT,
car_id VARCHAR(255),
`car_info` varchar(255) DEFAULT NULL,
`car_date` date DEFAULT NULL,
`car_mile` float DEFAULT NULL,
`car_total_price` float DEFAULT NULL,
`car_payment` float DEFAULT NULL,
`city` varchar(255) DEFAULT NULL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into secondCar(car_id, car_info, car_date, car_mile, car_total_price, car_payment, city, created_at, updated_at)
(select p.car_id, p.car_info, p.car_date, p.car_mile, p.car_total_price, p.car_payment,
        max(city) city, max(created_at) created_at, max(updated_at) updated_at
 from ppCar p
 group by p.car_id, p.car_info, p.car_date, p.car_mile, p.car_total_price, p.car_payment);
'''
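# Optional helper (a sketch, not part of the original pipeline and not called from
# main()): run the de-duplication INSERT ... SELECT from the docstring above on the
# same pymysql connection once a crawl has finished.
def dedup_to_second_car(conn, cur):
    dedup_sql = (
        "insert into secondCar(car_id, car_info, car_date, car_mile, car_total_price, car_payment, "
        "city, created_at, updated_at) "
        "(select p.car_id, p.car_info, p.car_date, p.car_mile, p.car_total_price, p.car_payment, "
        "max(city) city, max(created_at) created_at, max(updated_at) updated_at "
        "from ppCar p group by p.car_id, p.car_info, p.car_date, p.car_mile, p.car_total_price, p.car_payment)"
    )
    try:
        cur.execute(dedup_sql)
    except Exception as e:
        print("dedup failed, rolling back:", e)
        conn.rollback()
    else:
        conn.commit()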
def data_to_mysql(conn, cur, data_list):
    """Bulk-insert one page of scraped rows into ppCar; roll back the batch on any error."""
    insert_sql = ('insert into ppCar(car_id, car_info, car_date, car_mile, car_total_price, car_payment, city) '
                  'values(%s, %s, %s, %s, %s, %s, %s)')
    try:
        cur.executemany(insert_sql, data_list)
    except Exception as e:
        print("insert failed, rolling back:", e)
        conn.rollback()
    else:
        conn.commit()
def num_trans():
    """Read the downloaded obfuscation font and return its digit values in glyph order.

    The site renders digits with a custom woff font whose glyph order is shuffled,
    so the glyph order itself is the key for decoding the digits shown on the page.
    """
    tf = TTFont("./trans.woff")
    num_list = tf.getGlyphOrder()[1:]  # skip .notdef; the rest are the ten digit glyph names
    num_dict = {"zero": 0, "one": 1, "two": 2,
                "three": 3, "four": 4, "five": 5,
                "six": 6, "seven": 7, "eight": 8,
                "nine": 9}
    albnum_list = [num_dict[i] for i in num_list]
    return albnum_list
def string_trans(string, albnum_list):
    """Decode obfuscated digits: each digit shown on the page is replaced by the
    index of that digit in albnum_list, which is the real value."""
    new_string = ''
    for i in string:
        if i.isdigit():
            char = albnum_list.index(int(i))
            new_string += str(char)
        else:
            new_string += i
    return new_string
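# Worked example (hypothetical glyph order): if tf.getGlyphOrder()[1:] were
# ['three', 'zero', 'seven', ...], albnum_list would start [3, 0, 7, ...], so an
# on-page "3" decodes to "0", "0" to "1" and "7" to "2", i.e. the real digit is
# the position of the displayed digit's glyph name in the font's glyph order.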
def selenium_scrapy_data(driver, url, albnum_list, city):
    """Scrape one listing page and return rows shaped for data_to_mysql()."""
    driver.get(url)
    data_list = []
    try:
        li_list = driver.find_elements(By.XPATH, "//ul[@class='row-fluid list-row js-car-list']/li")
        for li in li_list:
            try:
                car_id = li.find_element(By.XPATH, "a").get_attribute("data-car-id")
                car_info = li.find_element(By.XPATH, "a/h3").text
                car_info = string_trans(car_info, albnum_list)
                # Registration date and mileage come as one "/"-separated field ("%Y年%m月" then the mileage figure).
                car_year_mile = li.find_element(By.XPATH, "a/div[@class='mileage']/span").text
                car_year_mile = string_trans(car_year_mile, albnum_list)
                car_date = datetime.strptime(car_year_mile.split("/")[0], '%Y年%m月').date()
                car_mile = re.match(r'([\d.]+)', car_year_mile.split("/")[1]).group(1)
                car_price = li.find_element(By.XPATH, "a/div[@class='tags-box']/div").text
                car_total_price = re.match(r'([\d.]+)', car_price).group(1)
                try:
                    # The down payment ("首付") is optional; fall back to the total price when it is absent.
                    car_payment = re.match(r'([\d.]+).*?首付([\d.]+)', car_price).group(2)
                except Exception:
                    car_payment = car_total_price
                print(car_info, car_date, car_mile, car_total_price, car_payment)
                data_list.append((car_id, car_info, car_date, car_mile, car_total_price, car_payment, city))
            except Exception as e:
                print("skipping a listing with an unexpected layout:", e)
    except Exception as e:
        print("failed to parse page:", url, e)
    return data_list
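# Note: each appended tuple must keep the same column order as the INSERT statement in
# data_to_mysql(): (car_id, car_info, car_date, car_mile, car_total_price, car_payment, city).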
def selenium_for_city(driver):
    """Collect the city codes used in listing URLs and download the current obfuscation font."""
    city_code_list = []
    driver.get('https://www.renrenche.com/hf/ershouche/')
    # The trailing space in 'province-item ' is deliberate: it matches the literal class attribute value.
    a_list = driver.find_elements(By.XPATH, "//div[@class='area-city-letter']//a[@class='province-item ']")
    for a in a_list:
        city_code = a.get_attribute("rrc-event-name")  # URL city code; the readable name sits in rrc-event-expand-tag_value
        city_code_list.append(city_code)
    print(city_code_list)
    # The first <style> block embeds the url of the digit-obfuscation woff font; extract and download it.
    woff_url = driver.find_element(By.XPATH, "//style[1]").get_attribute('textContent')
    woff_url = re.match(r'[\s\S]*?url\(\'(.*?\.woff)', woff_url).group(1)
    print(woff_url)
    res_con = requests.get(woff_url, headers={"User-Agent": fake_useragent.UserAgent().random}).content
    if os.path.exists("./trans.woff"):
        os.remove("./trans.woff")
    with open("./trans.woff", "wb") as fp:
        fp.write(res_con)
    return city_code_list
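# An assumption about why the file is rewritten on every run: the site can serve a
# freshly shuffled font, so a cached trans.woff (and its digit mapping) may go stale.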
def main():
    conn = pymysql.connect(host="master", user='root', password='123456', port=3306, db='carProject')
    cur = conn.cursor()
    # Run Edge headless so the crawl can run unattended.
    opt = options.Options()
    opt.add_argument("--headless")
    driver = webdriver.Edge(options=opt)
    city_list = selenium_for_city(driver)  # also refreshes ./trans.woff
    albnum_list = num_trans()
    for city in city_list:
        for i in range(1, 50):
            time.sleep(1)  # throttle requests between pages
            url = f"https://########(不予显示)/{city}/ershouche/p{i}/"  # listing domain redacted in the original ("不予显示" = not shown)
            data_list = selenium_scrapy_data(driver, url, albnum_list, city)
            if len(data_list) == 0:
                print(url, "page is empty; skipping the remaining pages for this city")
                break
            data_to_mysql(conn, cur, data_list)
    driver.quit()  # quit() ends the whole browser session, not just the current window
    cur.close()
    conn.close()
if __name__ == '__main__':
    main()