1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
| cat down_nexus_3.X_xml_md5_sha1.py #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ 使用 Nexus 3 REST API 下载所有 artifacts - 完整版本 下载所有目录中的 maven-metadata.xml, .md5, .sha1 文件 """ import os import sys import time import json import requests from requests.auth import HTTPBasicAuth from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry
def setup_requests_session(username, password):
    """Build a ``requests`` session with basic auth and automatic retries.

    Args:
        username: User name sent with HTTP basic authentication.
        password: Password sent with HTTP basic authentication.

    Returns:
        A ``requests.Session`` whose ``http://`` and ``https://`` prefixes
        are both served by one retrying adapter.
    """
    # Up to 3 retries with a backoff factor of 1, applied to GET/HEAD
    # requests that come back with one of the listed status codes.
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"],
        )
    )

    http = requests.Session()
    http.auth = HTTPBasicAuth(username, password)
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    return http
def download_file(session, url, local_path):
    """Download ``url`` to ``local_path`` and report the outcome.

    The body is streamed into ``<local_path>.part`` and only moved onto
    ``local_path`` once it has been written completely, so an interrupted
    transfer never leaves a truncated file under the final name (callers such
    as ``download_metadata_files`` skip any file that already exists).

    Args:
        session: Object with a ``requests``-style ``get(url, stream=, timeout=)``.
        url: Address of the file to fetch.
        local_path: Destination file; missing parent directories are created.

    Returns:
        A ``(success, message)`` tuple.  ``success`` is True only for an
        HTTP 200 response that was written to disk; ``message`` is a short
        human-readable status.  Errors are reported through the tuple and
        are not raised.
    """
    try:
        partial_path = os.fspath(local_path) + ".part"
        response = session.get(url, stream=True, timeout=30)
        try:
            if response.status_code == 404:
                return False, "文件不存在 (404)"
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}"

            # dirname is "" for a bare file name, and os.makedirs("") raises.
            local_dir = os.path.dirname(local_path)
            if local_dir:
                os.makedirs(local_dir, exist_ok=True)

            try:
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                os.replace(partial_path, local_path)
            finally:
                # After a successful replace the temp file is gone; if it is
                # still here the transfer failed and the fragment is dropped.
                if os.path.exists(partial_path):
                    os.remove(partial_path)
        finally:
            # With stream=True the connection is only released once the body
            # is fully read or the response is closed explicitly.
            response.close()

        return True, f"成功下载 ({os.path.getsize(local_path)} bytes)"
    except requests.exceptions.Timeout:
        return False, "超时"
    except Exception as e:
        return False, str(e)
def find_metadata_urls_from_components(components, base_url, repository):
    """Derive candidate ``maven-metadata.xml`` URLs from component records.

    For every component that has both a group and a name, a URL is proposed
    for the project directory (``group`` with dots turned into slashes, plus
    ``name``), for the version directory when a version is present, and for
    each ancestor directory of the first asset's ``path``.  The URLs are
    guesses: nothing here checks that the files exist.

    Args:
        components: Iterable of dicts read through the keys ``group``,
            ``name``, ``version`` and ``assets`` (a list of dicts carrying
            a ``path`` key).
        base_url: Server root that the URLs are built on.
        repository: Repository name placed after ``/repository/``.

    Returns:
        A sorted list of unique URLs (sorted so repeated runs are stable).
    """
    repo_root = f"{base_url}/repository/{repository}"
    file_suffixes = ('.jar', '.pom', '.war', '.ear', '.zip', '.tar.gz', '.tgz')
    metadata_urls = set()  # components share ancestors, so de-duplicate

    for component in components:
        group = component.get("group") or ""
        name = component.get("name") or ""
        if not (group and name):
            continue

        # Project-level metadata.
        project_dir = f"{group.replace('.', '/')}/{name}"
        metadata_urls.add(f"{repo_root}/{project_dir}/maven-metadata.xml")

        # Version-level metadata, when the component carries a version.
        version = component.get("version") or ""
        if version:
            metadata_urls.add(
                f"{repo_root}/{project_dir}/{version}/maven-metadata.xml"
            )

        # Every ancestor directory of the first asset.  The last segment is
        # the file name; empty segments (leading or doubled '/') are dropped
        # so they cannot produce '//' inside the URL.
        assets = component.get("assets") or []
        path = (assets[0].get("path") or "") if assets else ""
        dir_segments = [seg for seg in path.split('/')[:-1] if seg]
        for depth, segment in enumerate(dir_segments, 1):
            # A segment that looks like an archive/file name is not offered
            # as a metadata directory.
            if segment.endswith(file_suffixes):
                continue
            dir_path = '/'.join(dir_segments[:depth])
            metadata_urls.add(f"{repo_root}/{dir_path}/maven-metadata.xml")

    return sorted(metadata_urls)
def crawl_directory_for_metadata(session, base_url, repository, start_dir=""):
    """Probe repository directories and propose their ``maven-metadata.xml`` URLs.

    Starting from ``start_dir`` each queued directory URL is requested, and
    when it answers HTTP 200 the ``maven-metadata.xml`` URL inside it is
    recorded.  Directory listings are not parsed, so nothing is ever added
    to the queue and only the start directory is probed.

    Args:
        session: Object with a ``requests``-style ``get(url, timeout=)``.
        base_url: Server root that the URLs are built on.
        repository: Repository name placed after ``/repository/``.
        start_dir: Directory, relative to the repository root, to start at.

    Returns:
        A list of candidate metadata URLs (empty if the probe failed).
    """
    metadata_urls = set()
    urls_to_check = [f"{base_url}/repository/{repository}/{start_dir}"]

    print("开始爬取目录结构...")

    while urls_to_check:
        current_url = urls_to_check.pop(0)

        try:
            response = session.get(current_url, timeout=30)
        except Exception:
            # Best effort: an unreachable directory is skipped.  A bare
            # ``except`` would also swallow Ctrl-C, so only Exception is
            # caught here.
            continue

        if response.status_code == 200:
            # TODO: parse the HTML listing and queue sub-directories (with a
            # depth limit) so that the crawl actually descends.
            metadata_urls.add(current_url.rstrip('/') + '/maven-metadata.xml')

    return list(metadata_urls)
class _PagedRequestFailed(Exception):
    """Raised by ``_iter_paged_items`` when a page answers with a non-200 status."""

    def __init__(self, status_code):
        super().__init__(f"HTTP {status_code}")
        self.status_code = status_code


def _iter_paged_items(session, url, params):
    """Yield each entry of ``items`` from every page of a token-paginated endpoint."""
    query = dict(params)  # copy so the caller's dict is not modified
    while True:
        response = session.get(url, params=query, timeout=30)
        if response.status_code != 200:
            raise _PagedRequestFailed(response.status_code)

        data = response.json()
        yield from data.get("items") or []

        token = data.get("continuationToken")
        if not token:
            return
        query["continuationToken"] = token


def find_all_metadata_files(session, base_url, repository):
    """Collect ``(download_url, relative_path)`` pairs for metadata files.

    Two strategies are tried in order:

    1. Query the search endpoint for ``maven-metadata.xml`` and keep every
       returned item that has both ``downloadUrl`` and ``path``.
    2. Only if that produced nothing: list all components of the repository
       and guess metadata URLs from them with
       ``find_metadata_urls_from_components``.

    Failures in either strategy are printed and whatever was gathered up to
    that point is kept; nothing is raised to the caller.

    Args:
        session: Object with a ``requests``-style ``get(url, params=, timeout=)``.
        base_url: Server root that the endpoints are built on.
        repository: Name of the repository to inspect.

    Returns:
        A list of ``(url, path)`` tuples; may be empty.
    """
    metadata_files = []

    # Strategy 1: search API.
    # NOTE(review): this reads downloadUrl/path from the top level of each
    # search item.  If the endpoint returns component records with those
    # fields nested under "assets", this pass yields nothing and strategy 2
    # always runs. Confirm against the Nexus REST documentation.
    print("方法1: 通过搜索API查找metadata文件...")
    search_url = f"{base_url}/service/rest/v1/search"
    search_params = {"repository": repository, "q": "maven-metadata.xml"}

    try:
        for item in _iter_paged_items(session, search_url, search_params):
            download_url = item.get("downloadUrl", "")
            path = item.get("path", "")
            if download_url and path:
                metadata_files.append((download_url, path))
    except _PagedRequestFailed as e:
        print(f"搜索失败: HTTP {e.status_code}")
    except Exception as e:
        print(f"搜索API出错: {e}")

    if metadata_files:
        return metadata_files

    # Strategy 2: guess paths from the component listing.
    print("方法2: 尝试直接访问已知路径...")
    components_url = f"{base_url}/service/rest/v1/components"
    components = []

    try:
        for item in _iter_paged_items(
            session, components_url, {"repository": repository}
        ):
            components.append(item)
    except _PagedRequestFailed as e:
        # Previously silent; report it but keep the pages already fetched.
        print(f"获取组件列表失败: HTTP {e.status_code}")
    except Exception as e:
        print(f"获取组件列表出错: {e}")

    if components:
        print(f"从 {len(components)} 个组件中提取metadata路径...")
        metadata_urls = find_metadata_urls_from_components(
            components, base_url, repository
        )

        prefix = f"{base_url}/repository/{repository}/"
        for url in metadata_urls:
            # Strip only the leading prefix; str.replace would also remove
            # any later occurrence inside the path.
            path = url[len(prefix):] if url.startswith(prefix) else url
            metadata_files.append((url, path))

    return metadata_files
def download_metadata_files(session, metadata_files, local_dir):
    """Download every listed metadata file plus its ``.md5``/``.sha1`` companions.

    Files whose local copy already exists are skipped (their checksum files
    are not re-checked).  After each successful download the two checksum
    files are fetched as well; a missing checksum is only reported and does
    not count as a failure.

    Args:
        session: Session object handed through to ``download_file``.
        metadata_files: Sequence of ``(url, path)`` tuples, ``path`` being
            the location relative to the repository root.
        local_dir: Directory under which the repository layout is mirrored.

    Returns:
        ``(downloaded_files, skipped_files, failed_files)``: two counters
        and a list of ``(url, local_path, message)`` tuples for the failures.
    """
    downloaded_files = 0
    skipped_files = 0
    failed_files = []

    print(f"\n开始下载 {len(metadata_files)} 个metadata文件...")
    print("="*60)

    for i, (url, path) in enumerate(metadata_files, 1):
        # os.path.join drops local_dir when the second part is absolute, so a
        # server-supplied path starting with '/' would be written outside the
        # mirror directory.  Strip leading slashes to keep it inside.
        local_path = os.path.join(local_dir, path.lstrip('/'))

        # Skip files that are already present.
        if os.path.exists(local_path):
            file_size = os.path.getsize(local_path)
            print(f"[{i}/{len(metadata_files)}] 文件已存在: {path} ({file_size} bytes)")
            skipped_files += 1
            continue

        print(f"[{i}/{len(metadata_files)}] 下载: {path}")

        # Main file.
        success, message = download_file(session, url, local_path)

        if success:
            downloaded_files += 1
            print(f" ✓ {message}")

            # Matching checksum files.
            for ext in ['.md5', '.sha1']:
                ext_url = url + ext
                ext_path = local_path + ext

                if not os.path.exists(ext_path):
                    ext_success, ext_message = download_file(session, ext_url, ext_path)
                    if ext_success:
                        print(f" ✓ 下载{ext}成功")
                    else:
                        print(f" ⚠ 下载{ext}失败: {ext_message}")
        else:
            failed_files.append((url, local_path, message))
            print(f" ✗ 失败: {message}")

        # Short pause so the server is not hammered.
        time.sleep(0.1)

    return downloaded_files, skipped_files, failed_files
def download_all_metadata(
    base_url="http://1.1.142.75:8081",
    repository="maven-snapshots",
    local_dir="/home/aliyun-maven-3/maven-snapshots",
    username="admin",
    password="Hs2wsx",
):
    """Entry point: locate and download all metadata files of a repository.

    Prints a banner, discovers the metadata files, de-duplicates them by
    path, downloads them and prints a summary.  When downloads failed, the
    full list is also written to ``metadata_download_failures.log`` inside
    ``local_dir``.

    Args:
        base_url: Root URL of the Nexus server.
        repository: Name of the repository to mirror.
        local_dir: Local directory that receives the files.
        username: Account used for HTTP basic authentication.
        password: Password used for HTTP basic authentication.

    Calling the function without arguments behaves as before.
    """
    # NOTE(review): the defaults embed a server address and credentials in
    # the source; callers should pass real values in (e.g. from the
    # environment) rather than rely on them.

    # Create the session.
    session = setup_requests_session(username, password)

    print("="*60)
    print("Nexus 3 Metadata文件下载工具")
    print("="*60)
    print(f"服务器: {base_url}")
    print(f"仓库: {repository}")
    print(f"本地目录: {local_dir}")
    print("="*60)

    # Discover the metadata files.
    metadata_files = find_all_metadata_files(session, base_url, repository)

    if not metadata_files:
        print("未找到任何metadata文件")
        return

    print(f"找到 {len(metadata_files)} 个可能的metadata文件")

    # De-duplicate by path; for a repeated path the last URL wins.
    unique_files = {}
    for url, path in metadata_files:
        unique_files[path] = (url, path)

    metadata_files = list(unique_files.values())
    print(f"去重后剩余 {len(metadata_files)} 个文件")

    # Download.
    downloaded_files, skipped_files, failed_files = download_metadata_files(
        session, metadata_files, local_dir
    )

    # Summary.
    print("\n" + "="*60)
    print("下载完成!")
    print("="*60)
    print(f"找到文件: {len(metadata_files)}")
    print(f"成功下载: {downloaded_files}")
    print(f"跳过(已存在): {skipped_files}")
    print(f"下载失败: {len(failed_files)}")

    if failed_files:
        print(f"\n失败文件详情 (前10个):")
        for url, path, error in failed_files[:10]:
            print(f" 文件: {os.path.basename(path)}")
            print(f" 错误: {error}")

        if len(failed_files) > 10:
            print(f" ... 还有 {len(failed_files) - 10} 个失败项")

        # Persist the complete failure list.  local_dir is only created as a
        # side effect of a successful download, so it may not exist yet when
        # every download failed.
        os.makedirs(local_dir, exist_ok=True)
        log_file = os.path.join(local_dir, "metadata_download_failures.log")
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write("metadata文件下载失败列表:\n")
            f.write("="*60 + "\n")
            for url, path, error in failed_files:
                f.write(f"文件: {path}\n")
                f.write(f"URL: {url}\n")
                f.write(f"错误: {error}\n")
                f.write("-" * 50 + "\n")
        print(f"\n详细失败记录已保存到: {log_file}")
if __name__ == "__main__":
    # Script entry point: run the download, exiting 0 on Ctrl-C and 1 (with
    # a traceback) on any other error.
    import traceback

    try:
        download_all_metadata()
    except KeyboardInterrupt:
        print("\n\n用户中断,程序退出")
        sys.exit(0)
    except Exception as exc:
        print("程序执行出错:", exc)
        traceback.print_exc()
        sys.exit(1)
|