import time
import requests
from bs4 import BeautifulSoup as bs
from lxml import etree
def start(num):
    """Fetch one listing page of the ``c4`` category and pass it to ``parse``.

    Args:
        num: Page number substituted into ``list{}.html``.

    Returns:
        None. The extracted rows are printed by ``parse``. When the request
        fails or the page cannot be parsed, the error is printed (if any) and
        the page is skipped.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    page_url = 'http://www.8749.cn/c4/list{}.html'.format(num)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        html = requests.get(page_url, headers=headers, timeout=5)
    except requests.RequestException as e:
        print(e)
        return
    # Wait two seconds before continuing, as the original script did.
    time.sleep(2)
    if not html.text.strip():
        # Nothing to parse; an empty body cannot contain listings.
        return
    response = etree.HTML(html.text)
    if response is None:
        # The parser produced no tree for this body, so there is nothing to query.
        return
    parse(response)
def parse(response):
    """Print the details of every company listed on one listing page.

    For each ``<li>`` under the ``pagelist`` element this extracts the company
    title, its link, its area and the contact-page link, fetches the contact
    page through ``url`` and prints
    ``title, link, area, name, tel_phone_``.

    Args:
        response: lxml element tree of a listing page.
    """
    items = response.xpath('//*[@class="pagelist"]/li')
    for item in items:
        try:
            # The leading "." anchors each query at the current <li>. A bare
            # "//" is evaluated against the whole document, which made every
            # row report the first company on the page.
            title = item.xpath('.//*[@class="company"]/a')[0].text
            link = item.xpath('.//*[@class="company"]/a/@href')[0]
            area = item.xpath('.//*[@class="list px12"]/a')[0].text
            tel = item.xpath('.//*[@class="contactbtn"]/a/@href')[0]
        except IndexError as e:
            # This <li> lacks one of the expected nodes; skip it.
            print(e)
            continue
        name, tel_phone_ = url(tel)
        print(title, link, area, name, tel_phone_)
def url(tel):
    """Fetch a company's contact page and extract contact name and mobile.

    Args:
        tel: URL of the contact page.

    Returns:
        A ``(name, tel_phone_)`` tuple. ``name`` is the text of the cell that
        follows the contact-person label, or None when that cell is missing.
        ``tel_phone_`` is the mobile number when the cell text consists only
        of digits, otherwise None. Both are None when the request fails or
        the page cannot be parsed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        url2 = requests.get(tel, headers=headers, timeout=5)
    except requests.RequestException as e:
        print(e)
        return None, None
    # Wait two seconds before continuing, as the original script did.
    time.sleep(2)
    response = etree.HTML(url2.text) if url2.text.strip() else None
    if response is None:
        return None, None
    name_cells = response.xpath('//td[text()="联系人:"]/following-sibling::td/text()')
    phone_cells = response.xpath('//td[text()="手 机:"]/following-sibling::td/text()')
    # Index only when the query matched, so a page without these rows no
    # longer raises IndexError.
    name = name_cells[0] if name_cells else None
    # strip() removes leading and trailing whitespace before the digit check.
    tel_phone = phone_cells[0].strip() if phone_cells else ""
    tel_phone_ = tel_phone if tel_phone.isdigit() else None
    return name, tel_phone_
# def fanye(num):
# for i in range(1, 10):
# print(parse(i))
if __name__ == '__main__':
    # Crawl listing pages 1 through 9. start() prints its own results and
    # returns nothing, so its return value is not printed (doing so only
    # emitted a stray "None" after every page).
    for i in range(1, 10):
        start(i)
以上代码演示了 CSS 选择器定位和 XPath 定位两种方式(CSS 选择器的写法保留在注释中)。
完整版(使用 CSS 选择器解析,并将结果写入 MySQL):
import threading
import time
from bs4 import BeautifulSoup as bs
import re
import requests
from pub_config import settings
import pymysql
class HuangYe():
    """Crawl one category of www.8749.cn and store its companies in MySQL.

    Every listing page is scanned for company entries. For each company not
    yet in the ``hy_companys`` table, the contact page is fetched for the
    contact name and phone number, and a new row is inserted.
    """

    # Browser-like User-Agent sent with every request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
    # Seconds after which a request is abandoned.
    TIMEOUT = 5
    # Seconds to wait before each follow-up request.
    DELAY = 2

    def __init__(self, info):
        """Store the category settings and open the database connection.

        Args:
            info: Mapping with ``"name"`` (category label, written to the
                ``classify`` column) and ``"value"`` (URL path segment of the
                category, appended to the site root).
        """
        self.url = "http://www.8749.cn/"
        self.name = info["name"]
        self.joint = info["value"]
        self.db = pymysql.connect(host='1', user='1', password='1', database='1')
        self.cursor = self.db.cursor()

    def run(self):
        """Process page 1, read the page count from it, then crawl the rest."""
        url = self.url + self.joint + 'list{}.html'
        response = self._fetch(url.format(1))
        if response is None:
            return
        self.parse(response)
        # Determine the highest page number from the pager links.
        labels = [a.text for a in response.select('#pageLink a')]
        maxnum = self._last_page(labels)
        self.start(2, maxnum, url)

    def start(self, num, maxnum, url):
        """Crawl listing pages ``num`` through ``maxnum`` inclusive.

        Args:
            num: First page number to fetch.
            maxnum: Last page number to fetch.
            url: Listing URL template containing one ``{}`` placeholder for
                the page number.
        """
        for page in range(num, maxnum + 1):
            print("下一页-----------------------", page)
            time.sleep(self.DELAY)
            response = self._fetch(url.format(page))
            if response is None:
                # The request failed; move on to the next page.
                continue
            self.parse(response)

    def parse(self, response):
        """Insert every not-yet-stored company found on a listing page.

        Args:
            response: BeautifulSoup tree of a listing page.
        """
        for item in response.select(".pagelist li"):
            title_link = item.select_one('.company a')
            if title_link is not None:
                title = title_link.text
                link = title_link.get("href")
            else:
                title = ""
                link = ""
            # Skip the entry when this company name and link are already stored.
            sql_1 = 'select * from hy_companys where `url`=%s and company=%s'
            self.cursor.execute(sql_1, (link, title))
            res = self.cursor.fetchone()
            if res:
                print('已存在', res)
                continue
            products = item.select_one('.product')
            if products is not None:
                products = products.text.replace("主营产品:", "").replace(" ", "").replace("\n", "")
            else:
                products = ""
            area_link = item.select_one('.px12 a')
            area = area_link.text if area_link is not None else ""
            contact_link = item.select_one('.contactbtn a')
            tel = contact_link.get("href") if contact_link is not None else None
            if tel:
                name, tel_phone_ = self.url1(tel)
            else:
                # No contact page is linked, so there is nothing to fetch.
                name, tel_phone_ = "", ""
            # Skip the entry when this company name and phone are already
            # stored. The `tel` column holds the phone number (see the insert
            # below), so the lookup must use tel_phone_, not the contact URL.
            sql_2 = 'select * from hy_companys where `tel`=%s and company=%s'
            self.cursor.execute(sql_2, (tel_phone_, title))
            res_1 = self.cursor.fetchone()
            if res_1:
                print('已存在', res_1)
                continue
            # Not stored yet: insert the new row.
            try:
                sql = 'insert into hy_companys(`company`,`area`, `classify`, `url`,`name`,`tel`,`origin_url`,`main_products`)values (%s, %s,%s,%s,%s,%s,%s,%s)'
                self.cursor.execute(sql, (title, area, self.name, link, name, tel_phone_, self.url, products))
                self.db.commit()
                print(title, link, area, name, tel_phone_)
            except Exception as e:
                print(e)
                self.db.rollback()

    def url1(self, tel):
        """Fetch a contact page and return ``(name, tel_phone_)``.

        Args:
            tel: URL of the company's contact page.

        Returns:
            The contact name with non-breaking spaces removed, and the phone
            number when it consists only of digits (otherwise ``""``). Both
            are ``""`` when the request fails or the cells are missing.
        """
        time.sleep(self.DELAY)
        response = self._fetch(tel)
        if response is None:
            return "", ""
        name_cell = response.select_one(' tr:nth-child(1) td:nth-child(2)')
        phone_cell = response.select_one(' tr:nth-child(3) td:nth-child(2)')
        name = name_cell.text.replace(u'\xa0', u'') if name_cell is not None else ""
        tel_phone_ = self._clean_phone(phone_cell.text) if phone_cell is not None else ""
        return name, tel_phone_

    def _fetch(self, page_url):
        """Download ``page_url``; return its BeautifulSoup tree or None on a request error."""
        try:
            html = requests.get(page_url, headers=self.HEADERS, timeout=self.TIMEOUT)
        except requests.RequestException as e:
            print(e)
            return None
        return bs(html.text, 'html.parser')

    @staticmethod
    def _clean_phone(text):
        """Return ``text`` without non-breaking spaces and surrounding whitespace if it is all digits, else ``""``."""
        digits = text.replace(u'\xa0', u'').strip()
        return digits if digits.isdigit() else ""

    @staticmethod
    def _last_page(labels):
        """Return the page count taken from the second-to-last pager label.

        Args:
            labels: Texts of the pager links, in document order.

        Returns:
            The second-to-last label as an int, or 1 when there are fewer
            than two labels or that label is not a number.
        """
        try:
            return int(labels[-2])
        except (IndexError, ValueError):
            return 1
if __name__ == '__main__':
    # Categories to crawl. "name" is the label stored with every record of
    # the category; "value" is the category's path segment in the site URL.
    dicts = [
        {"name": "机械", "value": "c4/"},
        {"name": "五金", "value": "c5/"},
        {"name": "服装", "value": "c6/"},
        {"name": "服饰", "value": "c7/"},
        {"name": "化工", "value": "c9/"},
        {"name": "橡塑", "value": "c10/"},
        {"name": "商务服务", "value": "c11/"},
        {"name": "印刷", "value": "c12/"},
        {"name": "纸业", "value": "c13/"},
        {"name": "汽摩及配件", "value": "c15/"},
        {"name": "家用电器", "value": "c16/"},
        {"name": "安全、防护", "value": "c17/"},
        {"name": "能源", "value": "c18/"},
        {"name": "交通运输", "value": "c19/"},
        {"name": "照明工业", "value": "c21/"},
        {"name": "仪器、仪表", "value": "c22/"},
        {"name": "电子元器件", "value": "c23/"},
        {"name": "电工电气", "value": "c24/"},
        {"name": "数码、电脑", "value": "c25/"},
        {"name": "礼品饰品", "value": "c26/"},
        {"name": "建筑建材", "value": "c27/"},
        {"name": "家居用品", "value": "c28/"},
        {"name": "纺织、皮革", "value": "c29/"},
        {"name": "包装", "value": "c30/"},
        {"name": "办公、文教", "value": "c31/"},
        {"name": "食品、饮料", "value": "c32/"},
        {"name": "农业", "value": "c33/"},
        {"name": "冶金矿产", "value": "c34/"},
        {"name": "运动、休闲", "value": "c35/"},
        {"name": "手机通讯", "value": "c36/"},
        {"name": "玩具", "value": "c37/"},
        {"name": "环保", "value": "c38/"},
        {"name": "传媒、广电", "value": "c39/"},
    ]
    for info in dicts:
        print(info)
        huang_ye = HuangYe(info)
        try:
            huang_ye.run()
        finally:
            # Every HuangYe opens its own MySQL connection; release it once
            # the category is finished (or has failed) so that connections
            # do not pile up across the categories.
            huang_ye.cursor.close()
            huang_ye.db.close()