
| cat down_nexus_3.X_xml_md5_sha1.py #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ 使用 Nexus 3 REST API 下载所有 artifacts - 完整版本 下载所有目录中的 maven-metadata.xml, .md5, .sha1 文件 """ import os import sys import time import json import requests from requests.auth import HTTPBasicAuth from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry
def setup_requests_session(username, password):
    """Build a ``requests`` session with basic auth and automatic retries.

    Args:
        username: Account name used for HTTP basic authentication.
        password: Password used for HTTP basic authentication.

    Returns:
        A ``requests.Session`` whose ``http://`` and ``https://`` prefixes
        are both served by one adapter configured with the retry policy
        below.
    """
    # Retry GET/HEAD up to three times on throttling and server errors.
    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD"],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    http.auth = HTTPBasicAuth(username, password)
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    return http
def download_file(session, url, local_path):
    """Download a single file to ``local_path``.

    The body is streamed into ``local_path + ".part"`` and moved into place
    only after it has been written completely, so an interrupted transfer
    never leaves a truncated file that a later run would treat as already
    downloaded.

    Args:
        session: Object exposing ``get(url, stream=..., timeout=...)``
            (a ``requests.Session`` in this script).
        url: Address of the file to fetch.
        local_path: Destination path; missing parent directories are created.

    Returns:
        A ``(success, message)`` tuple. ``success`` is ``True`` only for an
        HTTP 200 response that was fully written. Errors are reported through
        ``message`` rather than raised.
    """
    partial_path = local_path + ".part"
    try:
        response = session.get(url, stream=True, timeout=30)
        try:
            if response.status_code == 404:
                return False, "文件不存在 (404)"
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}"

            # A bare file name has no directory part; makedirs('') would fail.
            local_dir = os.path.dirname(local_path)
            if local_dir:
                os.makedirs(local_dir, exist_ok=True)

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            os.replace(partial_path, local_path)
        finally:
            # With stream=True the connection is held until the response is
            # closed, including on the non-200 early returns above.
            response.close()

        return True, f"成功下载 ({os.path.getsize(local_path)} bytes)"
    except requests.exceptions.Timeout:
        return False, "超时"
    except Exception as e:
        return False, str(e)
    finally:
        # Discard whatever an aborted transfer left behind.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass
def find_metadata_urls_from_components(components, base_url, repository):
    """Derive candidate ``maven-metadata.xml`` URLs from component records.

    For every component that has both a ``group`` and a ``name`` this adds:

    * the project-level URL (``group`` with dots turned into slashes, then
      ``name``),
    * the version-level URL when ``version`` is set,
    * one URL for each parent directory of the first asset's ``path``.

    Args:
        components: Iterable of dicts read with the keys ``group``, ``name``,
            ``version`` and ``assets`` (a list of dicts carrying ``path``).
        base_url: Server root, used verbatim as the URL prefix.
        repository: Repository name inserted after ``/repository/``.

    Returns:
        A sorted list of unique URLs, so repeated runs visit them in the
        same order.
    """
    repo_root = f"{base_url}/repository/{repository}"
    # Directory names ending like an artifact file are not treated as dirs.
    file_suffixes = ('.jar', '.pom', '.war', '.ear', '.zip', '.tar.gz', '.tgz')
    metadata_urls = set()  # set: the three sources below overlap heavily

    for component in components:
        group = component.get("group", "")
        name = component.get("name", "")
        if not (group and name):
            continue

        version = component.get("version", "")
        assets = component.get("assets") or []
        path = assets[0].get("path", "") if assets else ""

        # Project-level metadata.
        project_dir = group.replace('.', '/') + '/' + name
        metadata_urls.add(f"{repo_root}/{project_dir}/maven-metadata.xml")

        # Version-level metadata.
        if version:
            metadata_urls.add(
                f"{repo_root}/{project_dir}/{version}/maven-metadata.xml"
            )

        # Every ancestor directory of the asset path. Empty segments (from a
        # leading slash or a doubled slash) are dropped so no URL contains
        # "//" in its path.
        segments = [segment for segment in path.split('/') if segment]
        for depth, segment in enumerate(segments[:-1], 1):
            if segment.endswith(file_suffixes):
                continue
            dir_path = '/'.join(segments[:depth])
            metadata_urls.add(f"{repo_root}/{dir_path}/maven-metadata.xml")

    return sorted(metadata_urls)
def crawl_directory_for_metadata(session, base_url, repository, start_dir=""):
    """Probe repository directories and collect candidate metadata URLs.

    Only the start directory is visited at present: the HTML directory
    listing is not parsed, so no sub-directories are ever queued. The
    original carried a depth limit (URLs with more than 15 ``/``-separated
    parts) placed where it had no effect; it should gate enqueueing once
    child discovery is implemented.

    Args:
        session: Object exposing ``get(url, timeout=...)``.
        base_url: Server root, used verbatim as the URL prefix.
        repository: Repository name inserted after ``/repository/``.
        start_dir: Directory, relative to the repository root, to start from.

    Returns:
        A sorted list of ``.../maven-metadata.xml`` URLs, one for each
        visited directory that answered with HTTP 200.
    """
    from collections import deque

    metadata_urls = set()
    pending = deque([f"{base_url}/repository/{repository}/{start_dir}"])

    print("开始爬取目录结构...")

    while pending:
        current_url = pending.popleft()

        try:
            response = session.get(current_url, timeout=30)
        except requests.exceptions.RequestException as exc:
            # Best-effort crawl: report and move on, but let
            # KeyboardInterrupt and programming errors propagate.
            print(f"访问目录失败: {current_url} ({exc})")
            continue

        if response.status_code != 200:
            continue

        # The directory exists, so its metadata file is a candidate.
        metadata_urls.add(current_url.rstrip('/') + '/maven-metadata.xml')

    return sorted(metadata_urls)
def find_all_metadata_files(session, base_url, repository):
    """Locate metadata files in a repository.

    Method 1 queries ``/service/rest/v1/search`` for ``maven-metadata.xml``
    and keeps every returned item that has both ``downloadUrl`` and ``path``.
    Method 2 runs only when method 1 found nothing: it pages through
    ``/service/rest/v1/components`` and derives candidate URLs with
    ``find_metadata_urls_from_components``.

    Args:
        session: Object exposing ``get(url, params=..., timeout=...)``.
        base_url: Server root, used verbatim as the URL prefix.
        repository: Repository name.

    Returns:
        A list of ``(url, path)`` tuples, where ``path`` is relative to the
        repository root for method-2 results. Errors are printed, not raised.
    """
    metadata_files = []

    # Method 1: search API.
    # NOTE(review): this reads downloadUrl/path at the top level of each
    # item; if the endpoint nests them under "assets", nothing matches here
    # and method 2 does all the work. Confirm against the server version.
    print("方法1: 通过搜索API查找metadata文件...")
    search_url = f"{base_url}/service/rest/v1/search"
    params = {
        "repository": repository,
        "q": "maven-metadata.xml"
    }

    try:
        while True:
            response = session.get(search_url, params=params, timeout=30)
            if response.status_code != 200:
                print(f"搜索失败: HTTP {response.status_code}")
                break

            data = response.json()
            for item in data.get("items", []):
                download_url = item.get("downloadUrl", "")
                path = item.get("path", "")
                if download_url and path:
                    metadata_files.append((download_url, path))

            continuation_token = data.get("continuationToken")
            if not continuation_token:
                break
            params["continuationToken"] = continuation_token
    except Exception as e:
        print(f"搜索API出错: {e}")

    if metadata_files:
        return metadata_files

    # Method 2: enumerate components and guess the metadata locations.
    print("方法2: 尝试直接访问已知路径...")
    components = []
    components_url = f"{base_url}/service/rest/v1/components"
    params = {"repository": repository}

    try:
        while True:
            response = session.get(components_url, params=params, timeout=30)
            if response.status_code != 200:
                print(f"获取组件列表失败: HTTP {response.status_code}")
                break

            data = response.json()
            components.extend(data.get("items", []))

            continuation_token = data.get("continuationToken")
            if not continuation_token:
                break
            params["continuationToken"] = continuation_token
    except Exception as e:
        # Keep whatever pages were fetched, but say why paging stopped.
        print(f"获取组件列表出错: {e}")

    if components:
        print(f"从 {len(components)} 个组件中提取metadata路径...")
        metadata_urls = find_metadata_urls_from_components(components, base_url, repository)

        # Strip only the leading repository prefix to get the relative path.
        prefix = f"{base_url}/repository/{repository}/"
        for url in metadata_urls:
            path = url[len(prefix):] if url.startswith(prefix) else url
            metadata_files.append((url, path))

    return metadata_files
def download_metadata_files(session, metadata_files, local_dir):
    """Download every listed metadata file plus its ``.md5``/``.sha1``.

    Files that already exist locally are skipped. After a successful
    download the matching checksum files are fetched as well; a checksum
    failure is only reported, not counted as a failed file.

    Args:
        session: Session object handed through to ``download_file``.
        metadata_files: Sequence of ``(url, path)`` tuples; ``path`` is the
            location relative to ``local_dir``.
        local_dir: Root directory the files are stored under.

    Returns:
        ``(downloaded_files, skipped_files, failed_files)`` where
        ``failed_files`` is a list of ``(url, local_path, message)`` tuples.
    """
    downloaded_files = 0
    skipped_files = 0
    failed_files = []
    total = len(metadata_files)

    print(f"\n开始下载 {total} 个metadata文件...")
    print("="*60)

    for i, (url, path) in enumerate(metadata_files, 1):
        # os.path.join drops local_dir entirely when the second argument is
        # absolute, so a server path with a leading slash must be made
        # relative before joining.
        local_path = os.path.join(local_dir, path.lstrip('/'))

        # Skip files that are already present.
        if os.path.exists(local_path):
            file_size = os.path.getsize(local_path)
            print(f"[{i}/{total}] 文件已存在: {path} ({file_size} bytes)")
            skipped_files += 1
            continue

        print(f"[{i}/{total}] 下载: {path}")

        success, message = download_file(session, url, local_path)

        if success:
            downloaded_files += 1
            print(f" ✓ {message}")

            # Fetch the companion checksum files.
            for ext in ['.md5', '.sha1']:
                ext_url = url + ext
                ext_path = local_path + ext

                if not os.path.exists(ext_path):
                    ext_success, ext_message = download_file(session, ext_url, ext_path)
                    if ext_success:
                        print(f" ✓ 下载{ext}成功")
                    else:
                        print(f" ⚠ 下载{ext}失败: {ext_message}")
        else:
            failed_files.append((url, local_path, message))
            print(f" ✗ 失败: {message}")

        # Short pause so the server is not hammered.
        time.sleep(0.1)

    return downloaded_files, skipped_files, failed_files
def download_all_metadata(base_url=None, repository=None, local_dir=None,
                          username=None, password=None):
    """Entry point: find and download every metadata file of a repository.

    Called without arguments it behaves as before, using the built-in
    defaults below. Each argument overrides the matching default.

    Args:
        base_url: Server root URL.
        repository: Repository name.
        local_dir: Directory the files are stored under.
        username: Account name for HTTP basic authentication.
        password: Password for HTTP basic authentication.

    Returns:
        None. Progress and a summary are printed; when downloads fail, the
        details are also written to ``metadata_download_failures.log`` inside
        ``local_dir``.
    """
    base_url = base_url or "http://1.1.142.75:8081"
    repository = repository or "maven-snapshots"
    local_dir = local_dir or "/home/aliyun-maven-3/maven-snapshots"
    username = username or "admin"
    # NOTE(review): a credential is embedded in source as the default; pass
    # `password` explicitly and rotate this value.
    password = password or "Hs2wsx"

    session = setup_requests_session(username, password)

    print("="*60)
    print("Nexus 3 Metadata文件下载工具")
    print("="*60)
    print(f"服务器: {base_url}")
    print(f"仓库: {repository}")
    print(f"本地目录: {local_dir}")
    print("="*60)

    metadata_files = find_all_metadata_files(session, base_url, repository)

    if not metadata_files:
        print("未找到任何metadata文件")
        return

    print(f"找到 {len(metadata_files)} 个可能的metadata文件")

    # De-duplicate by path; the last entry seen for a path wins.
    unique_files = {}
    for url, path in metadata_files:
        unique_files[path] = (url, path)

    metadata_files = list(unique_files.values())
    print(f"去重后剩余 {len(metadata_files)} 个文件")

    downloaded_files, skipped_files, failed_files = download_metadata_files(
        session, metadata_files, local_dir
    )

    print("\n" + "="*60)
    print("下载完成!")
    print("="*60)
    print(f"找到文件: {len(metadata_files)}")
    print(f"成功下载: {downloaded_files}")
    print(f"跳过(已存在): {skipped_files}")
    print(f"下载失败: {len(failed_files)}")

    if failed_files:
        print("\n失败文件详情 (前10个):")
        for url, path, error in failed_files[:10]:
            print(f" 文件: {os.path.basename(path)}")
            print(f" 错误: {error}")

        if len(failed_files) > 10:
            print(f" ... 还有 {len(failed_files) - 10} 个失败项")

        # When every download failed nothing has created local_dir yet, so
        # make sure it exists before writing the failure log into it.
        os.makedirs(local_dir, exist_ok=True)
        log_file = os.path.join(local_dir, "metadata_download_failures.log")
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write("metadata文件下载失败列表:\n")
            f.write("="*60 + "\n")
            for url, path, error in failed_files:
                f.write(f"文件: {path}\n")
                f.write(f"URL: {url}\n")
                f.write(f"错误: {error}\n")
                f.write("-" * 50 + "\n")
        print(f"\n详细失败记录已保存到: {log_file}")
if __name__ == "__main__":
    # Script entry point: exit 0 on Ctrl-C, exit 1 with a traceback on any
    # other error.
    try:
        download_all_metadata()
    except KeyboardInterrupt:
        print("\n\n用户中断,程序退出")
        sys.exit(0)
    except Exception as exc:
        from traceback import print_exc

        print("程序执行出错:", str(exc))
        print_exc()
        sys.exit(1)
|