EasySpider/ExecuteStage/llm_beta.py
2024-12-24 00:14:35 +08:00

37 lines
1.1 KiB
Python

from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
# 加载 Llama 3.2 视觉模型和处理器
model_name = "meta-llama/Llama-3.2-11B-Vision" # 请根据实际模型路径替换
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVision2Seq.from_pretrained(model_name)
# 处理网页截图并提取结构
def predict_structure_from_image(image_path):
# 加载图像
image = Image.open(image_path).convert("RGB")
# 预处理图像
inputs = processor(images=image, return_tensors="pt")
# 生成描述(结构描述)
outputs = model.generate(
inputs["pixel_values"],
max_length=512,
num_beams=5,
early_stopping=True
)
description = processor.decode(outputs[0], skip_special_tokens=True)
return description
# 示例使用
if __name__ == "__main__":
# 提供网页截图的路径
image_path = "webpage_screenshot.png" # 请替换为实际的图像文件路径
# 预测结构
predicted_structure = predict_structure_from_image(image_path)
print("预测的结构:", predicted_structure)