初始提交:零差云控官网爬虫项目
This commit is contained in:
100
zeroerr_crawler/utils.py
Normal file
100
zeroerr_crawler/utils.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""
|
||||
工具函数模块
|
||||
提供通用的辅助功能
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
import requests
|
||||
from urllib.parse import urljoin
|
||||
from .config import HEADERS
|
||||
|
||||
|
||||
def ensure_dir(path: str) -> None:
    """Make sure the directory ``path`` exists, creating parents as needed.

    Args:
        path: Directory path to create. An empty string is treated as
            "the current directory" and is a no-op.
    """
    # os.makedirs("") raises FileNotFoundError; an empty path typically comes
    # from os.path.dirname() of a bare file name, where nothing needs creating.
    if not path:
        return
    os.makedirs(path, exist_ok=True)
|
||||
|
||||
|
||||
def get_file_hash(url: str) -> str:
    """Return a short, deterministic hash of ``url`` for use in a file name.

    Args:
        url: The URL to fingerprint.

    Returns:
        The first 12 hexadecimal characters of the MD5 digest of the
        UTF-8 encoded URL.
    """
    # MD5 is used purely as a naming key here, not for security; flagging it
    # as such keeps the call working on FIPS-restricted Python builds.
    digest = hashlib.md5(url.encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()[:12]
|
||||
|
||||
|
||||
def get_file_extension(url: str) -> str:
    """Return the image file extension implied by ``url``.

    Args:
        url: Image URL, optionally carrying a query string and/or fragment.

    Returns:
        The lower-cased extension (including the leading dot) when it is one
        of the recognised image types, otherwise ``'.jpg'`` as the default.
    """
    known_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg')

    # Drop the query string and the fragment: neither is part of the file
    # name, and leaving "#..." attached would hide a valid extension.
    clean_url = url.split('?', 1)[0].split('#', 1)[0]
    ext = os.path.splitext(clean_url)[1].lower()

    return ext if ext in known_extensions else '.jpg'
|
||||
|
||||
|
||||
def download_image(img_url: str, save_dir: str, timeout: int = 15) -> str | None:
    """Download an image to ``save_dir`` and return its local path.

    The local file name is derived from a hash of the URL plus the image
    extension, so the same URL always maps to the same file and is only
    fetched once.

    Args:
        img_url: URL of the image to fetch.
        save_dir: Directory the image is stored in (created if missing).
        timeout: Request timeout passed to ``requests.get``.

    Returns:
        The local file path on success (or if the file already exists),
        ``None`` if the download failed for any reason.
    """
    try:
        ensure_dir(save_dir)

        local_filename = f"{get_file_hash(img_url)}{get_file_extension(img_url)}"
        local_path = os.path.join(save_dir, local_filename)

        # Already downloaded: reuse the existing file.
        if os.path.exists(local_path):
            return local_path

        response = requests.get(img_url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            print(f" 图片下载失败 ({response.status_code}): {img_url}")
            return None

        # Write to a side file first and rename it into place, so that an
        # interrupted write never leaves a truncated file at local_path that
        # the "already downloaded" check above would later accept.
        tmp_path = f"{local_path}.part"
        try:
            with open(tmp_path, 'wb') as f:
                f.write(response.content)
            os.replace(tmp_path, local_path)
        finally:
            # After a successful replace the side file is gone; this only
            # removes leftovers from a failed write.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        return local_path

    except Exception as e:
        # Deliberately best-effort: one failed image must not abort the caller.
        print(f" 图片下载出错: {img_url} - {e}")
        return None
|
||||
|
||||
|
||||
# Maps every character that is unsafe in a file name to an underscore.
_UNSAFE_FILENAME_TABLE = str.maketrans(
    {ch: '_' for ch in ('/', '\\', ':', '*', '?', '"', '<', '>', '|', '\n', '\r', '\t')}
)


def safe_filename(name: str, max_length: int = 50) -> str:
    """Turn ``name`` into a string that is safe to use as a file name.

    Path separators, characters reserved on common file systems, and
    newline/tab characters are replaced with ``_``; surrounding whitespace
    is removed; the result is cut to at most ``max_length`` characters.

    Args:
        name: Original name.
        max_length: Maximum number of characters to keep.

    Returns:
        The sanitised file name, with no leading or trailing whitespace.
    """
    # Single pass over the string instead of one replace() per character.
    name = name.translate(_UNSAFE_FILENAME_TABLE).strip()

    if len(name) > max_length:
        # Truncation can end right after a space; trim again so the result
        # never carries trailing whitespace.
        name = name[:max_length].rstrip()

    return name
|
||||
|
||||
|
||||
def make_absolute_url(base_url: str, relative_url: str) -> str:
    """Resolve ``relative_url`` against ``base_url`` and return the result.

    Args:
        base_url: URL the relative reference is resolved against.
        relative_url: Relative (or already absolute) URL to resolve.

    Returns:
        The URL produced by ``urljoin`` for the two inputs.
    """
    resolved = urljoin(base_url, relative_url)
    return resolved
|
||||
|
||||
Reference in New Issue
Block a user