1. 环境准备
# torch backs return_tensors="pt" and the GPU example; sentencepiece is needed by MarianTokenizer.
pip install transformers torch sentencepiece
2. 下载机器翻译模型:
2.1 通过代码从 Hugging Face 平台下载
from transformers import MarianMTModel, MarianTokenizer

# Hugging Face Hub id of the Chinese -> English translation model.
model_name = "Helsinki-NLP/opus-mt-zh-en"

# Save directory. It must match the path the loading examples read from
# ("./local_opus_mt_zh_en", with an underscore); the previous "zh-en"
# spelling pointed the loaders at a directory that was never created.
local_model_path = "./local_opus_mt_zh_en"

# Download the tokenizer and store it locally.
tokenizer = MarianTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(local_model_path)

# Download the model and store it in the same directory.
model = MarianMTModel.from_pretrained(model_name)
model.save_pretrained(local_model_path)
2.2 手动下载模型
模型文件也可以从阿里云天池的数据集页面(model_数据集-阿里云天池)手动下载。
2.3 加载模型进行翻译
2.3.1 翻译文本
from transformers import MarianMTModel, MarianTokenizer

# Directory holding the locally saved model and tokenizer.
local_model_path = "./local_opus_mt_zh_en"

# Load the tokenizer, then the model, from the local directory.
tokenizer = MarianTokenizer.from_pretrained(local_model_path)
model = MarianMTModel.from_pretrained(local_model_path)

# Translate a test sentence: encode, generate, decode the first sequence.
text = "你好"
encoded = tokenizer(text, return_tensors="pt")
generated_ids = model.generate(**encoded)
translated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(translated_text)  # original note: prints "hello"
2.3.2 翻译 JSON 字符串
import json

from transformers import MarianMTModel, MarianTokenizer

# Directory holding the locally saved model and tokenizer.
local_model_path = "./local_opus_mt_zh_en"

# Load the tokenizer, then the model, from the local directory.
tokenizer = MarianTokenizer.from_pretrained(local_model_path)
model = MarianMTModel.from_pretrained(local_model_path)

# Input JSON string.
input_json = '''
{"NUM1": "你好世界","NUM2": "又是开心的一天:","NUM3": "你在哪里","NUM4": "嘿嘿嘿"
}
'''

# Parse the JSON string into a dict.
input_data = json.loads(input_json)

# Keys whose values should be translated.
fields_to_translate = ["NUM1", "NUM2", "NUM3", "NUM4"]

# Translation helper.
def translate_text(text, tokenizer, model):
    """Translate ``text`` using the given tokenizer and model.

    Args:
        text: Source string handed to ``tokenizer``.
        tokenizer: Callable that encodes ``text`` (called with
            ``return_tensors="pt"``) and provides ``decode``.
        model: Object exposing ``generate(**inputs)``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    inputs = tokenizer(text, return_tensors="pt")
    translated = model.generate(**inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text


# Walk the fields to translate and translate each one.
for field in fields_to_translate:
    # Keys missing from the parsed input are skipped instead of raising KeyError.
    if field in input_data:
        input_data[field] = translate_text(input_data[field], tokenizer, model)

# Serialize the translated data back to a JSON string; ensure_ascii=False
# keeps non-ASCII characters readable instead of \u-escaping them.
output_json = json.dumps(input_data, ensure_ascii=False, indent=2)

# Print the result.
print(output_json)
2.4 使用 GPU 加速翻译
import time

import torch
from transformers import MarianMTModel, MarianTokenizer

# Directory holding the locally saved model and tokenizer.
local_model_path = "./local_opus_mt_zh_en"

# Load the tokenizer, then the model, from the local directory.
tokenizer = MarianTokenizer.from_pretrained(local_model_path)
model = MarianMTModel.from_pretrained(local_model_path)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Measure translation speed.
text = "你好,世界!"
# perf_counter is a monotonic, high-resolution clock meant for intervals.
start_time = time.perf_counter()

# Encode the text and move the encoded inputs to the selected device.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate the translation.
translated = model.generate(**inputs)

# Decode the first generated sequence back into text.
translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
end_time = time.perf_counter()

print(f"翻译结果: {translated_text}")
print(f"翻译耗时: {end_time - start_time:.4f} 秒")