初始提交:零差云控官网爬虫项目

This commit is contained in:
oy2020
2026-01-29 17:29:14 +08:00
commit 51b67b9e68
406 changed files with 14247 additions and 0 deletions

100
zeroerr_crawler/utils.py Normal file
View File

@@ -0,0 +1,100 @@
"""
工具函数模块
提供通用的辅助功能
"""
import os
import hashlib
import requests
from urllib.parse import urljoin
from .config import HEADERS
def ensure_dir(path: str) -> None:
    """Create the directory ``path`` (and any missing parents) if needed.

    An already existing directory is left untouched. An empty ``path`` is
    treated as "the current directory" and is a no-op, because
    ``os.makedirs("")`` would otherwise raise ``FileNotFoundError``.

    Args:
        path: Directory path to create.
    """
    if not path:
        # e.g. os.path.dirname("file.txt") == "" -- nothing to create.
        return
    os.makedirs(path, exist_ok=True)
def get_file_hash(url: str) -> str:
    """Return a short, deterministic file-name stem derived from a URL.

    The stem is the first 12 hexadecimal characters of the MD5 digest of the
    UTF-8 encoded URL. The same URL always maps to the same stem; because the
    digest is truncated, distinct URLs can in principle collide.

    Args:
        url: The URL to fingerprint.

    Returns:
        A 12-character lowercase hexadecimal string.
    """
    # MD5 only names files here, it protects nothing. Declaring that lets the
    # call succeed on builds where MD5 is blocked for security purposes.
    digest = hashlib.md5(url.encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()[:12]
# Extensions accepted as-is; anything else falls back to the default.
_KNOWN_IMAGE_EXTENSIONS = frozenset(
    ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg')
)


def get_file_extension(url: str) -> str:
    """Return the lowercase image file extension found in a URL.

    Both the query string (``?...``) and the fragment (``#...``) are removed
    before the extension is read, so ``a.png?v=1`` and ``a.png#top`` both give
    ``.png``.

    Args:
        url: The image URL.

    Returns:
        One of ``.jpg``, ``.jpeg``, ``.png``, ``.gif``, ``.webp``, ``.bmp`` or
        ``.svg``. When the URL has no extension, or one outside that list,
        ``.jpg`` is returned as the default.
    """
    # Cut at whichever of '?' / '#' appears, regardless of their order.
    clean_url = url.split('?', 1)[0].split('#', 1)[0]
    ext = os.path.splitext(clean_url)[1].lower()
    if ext not in _KNOWN_IMAGE_EXTENSIONS:
        return '.jpg'  # default extension
    return ext
def download_image(img_url: str, save_dir: str, timeout: int = 15) -> str | None:
    """Download an image into ``save_dir`` and return its local path.

    The local file name is built from ``get_file_hash(img_url)`` plus
    ``get_file_extension(img_url)``. If a file with that name already exists
    in ``save_dir`` it is reused and no request is made.

    The body is first written to a temporary ``.part`` file and then moved
    into place, so an interrupted write never leaves a truncated file that a
    later call would mistake for a finished download.

    Args:
        img_url: URL of the image to fetch.
        save_dir: Directory to store the image in (created if missing).
        timeout: Request timeout passed to ``requests.get``.

    Returns:
        The local file path on success, or ``None`` when the server does not
        answer with status 200 or any error occurs. Failures are printed, not
        raised.
    """
    try:
        ensure_dir(save_dir)
        local_filename = f"{get_file_hash(img_url)}{get_file_extension(img_url)}"
        local_path = os.path.join(save_dir, local_filename)

        # Already downloaded earlier: reuse the existing file.
        if os.path.exists(local_path):
            return local_path

        response = requests.get(img_url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            print(f" 图片下载失败 ({response.status_code}): {img_url}")
            return None

        # Write next to the target, then rename, so the final path only ever
        # appears once the full body is on disk.
        tmp_path = f"{local_path}.part"
        try:
            with open(tmp_path, 'wb') as f:
                f.write(response.content)
            os.replace(tmp_path, local_path)
        except OSError:
            # Best-effort cleanup of the partial file; the original error is
            # re-raised and reported by the outer handler.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            raise
        return local_path
    except Exception as e:
        # Deliberate best-effort boundary: one bad image must not stop a crawl.
        print(f" 图片下载出错: {img_url} - {e}")
        return None
# Maps every character that is unsafe in a file name to an underscore.
_UNSAFE_FILENAME_TABLE = str.maketrans(
    dict.fromkeys('/\\:*?"<>|\n\r\t', '_')
)


def safe_filename(name: str, max_length: int = 50) -> str:
    """Turn an arbitrary string into a file-system-safe file name.

    The characters ``/ \\ : * ? " < > |`` and newline, carriage return and
    tab are each replaced by ``_``. Surrounding whitespace is then stripped
    and the result is cut to at most ``max_length`` characters.

    Args:
        name: The original name.
        max_length: Maximum number of characters in the result.

    Returns:
        The sanitized name. It may be empty if ``name`` contained only
        whitespace.
    """
    cleaned = name.translate(_UNSAFE_FILENAME_TABLE).strip()
    if len(cleaned) > max_length:
        # The cut can land just after a space; strip again so the result
        # never ends in whitespace.
        cleaned = cleaned[:max_length].rstrip()
    return cleaned
def make_absolute_url(base_url: str, relative_url: str) -> str:
    """Resolve ``relative_url`` against ``base_url`` and return the result.

    This is a thin wrapper around ``urllib.parse.urljoin``.

    Args:
        base_url: URL that ``relative_url`` is resolved against.
        relative_url: The reference to resolve.

    Returns:
        The joined URL as produced by ``urljoin``.
    """
    absolute_url = urljoin(base_url, relative_url)
    return absolute_url