Blue customers come from 120+ countries around the world. So it makes sense to have Blue in as many languages as possible.
Blue is currently available in 24 languages:
Arabic (ar)
Bengali (bn)
Chinese (zh)
Czech (cz)
English (en)
French (fr)
Georgian (ka)
German (de)
Hindi (hi)
Hungarian (hu)
Indonesian (id)
Italian (it)
Japanese (ja)
Khmer (km)
Korean (ko)
Latvian (lv)
Marathi (mr)
Portuguese (pt)
Romanian (ro)
Russian (ru)
Spanish (es)
Swedish (se)
Thai (th)
Vietnamese (vi)
Introduction
We have transitioned from our previous approach of community translation to automatic translation using GPT-4. This reduces time and cost by 97% while maintaining an acceptable level of accuracy.
Getting Translation Files
To ensure the most up-to-date and accurate translations, we use a Python script to pull the latest language JSON files from our development server. This script consolidates these files into a master CSV file.
import base64
import csv
import json
import os
from collections import defaultdict

# Configuration for GitLab
GITLAB_TOKEN = 'GITLAB TOKEN GOES HERE'
GITLAB_PROJECT_ID = '8477037'
REPO_PATH = 'src/locales'
LOCAL_SAVE_PATH = 'downloaded_json_files'  # Local directory to save files


def flatten_dict(d, parent_key='', sep='.'):
    """Flatten a nested dict into a single level with `sep`-joined keys.

    Args:
        d: The (possibly nested) dictionary to flatten.
        parent_key: Key prefix accumulated from the enclosing levels.
        sep: Separator placed between key segments.

    Returns:
        A new flat dict, e.g. ``{'a': {'b': 1}}`` becomes ``{'a.b': 1}``.
    """
    items = {}
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, dict):
            items.update(flatten_dict(v, new_key, sep=sep))
        else:
            items[new_key] = v
    return items


def download_locale_files():
    """Download every ``.json`` blob under REPO_PATH on the ``dev`` branch.

    Files are written to LOCAL_SAVE_PATH, which is created if missing.
    """
    # Imported lazily so the pure helpers in this script can be imported
    # (and tested) on machines that do not have python-gitlab installed.
    import gitlab

    gl = gitlab.Gitlab('https://gitlab.com', private_token=GITLAB_TOKEN)
    project = gl.projects.get(GITLAB_PROJECT_ID)

    os.makedirs(LOCAL_SAVE_PATH, exist_ok=True)

    # ref is the branch
    files = project.repository_tree(path=REPO_PATH, ref='dev', all=True)
    for file in files:
        if file['type'] == 'blob' and file['name'].endswith('.json'):
            file_data = project.files.get(file_path=file['path'], ref='dev')
            decoded_content = base64.b64decode(file_data.content).decode('utf-8')
            # Write explicitly as UTF-8: the platform default encoding may not
            # be able to represent non-Latin translations.
            target = os.path.join(LOCAL_SAVE_PATH, file['name'])
            with open(target, 'w', encoding='utf-8') as f:
                f.write(decoded_content)
    print("Download complete.")


def collect_translations(directory=LOCAL_SAVE_PATH):
    """Merge all ``<lang>.json`` files in `directory` into one mapping.

    Args:
        directory: Folder containing one JSON file per language.

    Returns:
        A dict mapping each flattened key to ``{lang: text}``.
    """
    translations = defaultdict(dict)
    names = [name for name in os.listdir(directory) if name.endswith('.json')]
    # Process English first, then the rest alphabetically, so the row order is
    # deterministic and follows the English source file.
    names.sort(key=lambda name: (name != 'en.json', name))
    for file_name in names:
        with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as f:
            data = json.load(f)
        lang = file_name.split('.')[0]
        for key, value in flatten_dict(data).items():
            translations[key][lang] = value
    return translations


def write_translations_csv(translations, csv_path='translations.csv'):
    """Write the merged translations to a CSV with columns Key, en, <langs...>.

    Args:
        translations: Mapping of flattened key to ``{lang: text}``.
        csv_path: Destination CSV file.
    """
    # Collect languages across ALL keys. Looking only at the first key misses
    # languages that key lacks, and DictWriter then raises ValueError on the
    # first row that contains one of them.
    seen = set()
    for langs in translations.values():
        seen.update(langs)
    languages = sorted(seen - {'en'})
    fieldnames = ['Key', 'en'] + languages  # 'en' is always the first language

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for key, langs in translations.items():
            row = {'Key': key}
            row.update(langs)
            writer.writerow(row)
    print("CSV creation complete.")


def pull_translations():
    """Download the locale files and consolidate them into translations.csv."""
    download_locale_files()
    write_translations_csv(collect_translations(LOCAL_SAVE_PATH))


if __name__ == '__main__':
    pull_translations()
Checking for Updated English Text
There is a rare case where the English itself has been updated. This is not a new row, but an update to an existing row, meaning all the translations for that row must be updated.
So, we must store the previous version we processed and then check the new version against the old version.
In the new version:
Check if all English key/value pairs are the same.
If there is a difference, delete the translations so we can retranslate those rows.
Automatic Translation
Leveraging GPT-4, our Python script automatically translates all missing strings. This method is not only efficient but also maintains the nuances of each language.
In Blue, variables or placeholders are enclosed in curly brackets {} and are not meant to be translated. Because GPT sometimes translates the content inside the {}, our Python script is designed to automatically correct inconsistencies in the curly-bracket placeholders across the different language translations.
import re
from collections import Counter

import pandas as pd

# OpenAI API key used to build the client on first use
OPENAI_API_KEY = 'OPENAIKEY'

# Ensure this file is in the same folder as the script
csv_file_path = 'translations.csv'

# Matches one placeholder tag such as {name}
TAG_PATTERN = re.compile(r"\{[^\}]+\}")

# Language code to full name mapping
language_mapping = {
    'ar': 'Arabic',
    'bn': 'Bengali',
    'zh': 'Chinese',
    'cz': 'Czech',
    'en': 'English',
    'fr': 'French',
    'ka': 'Georgian',
    'de': 'German',
    'hi': 'Hindi',
    'hu': 'Hungarian',
    'id': 'Indonesian',
    'it': 'Italian',
    'ja': 'Japanese',
    'km': 'Khmer',
    'ko': 'Korean',
    'lv': 'Latvian',
    'mr': 'Marathi',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'ru': 'Russian',
    'es': 'Spanish',
    'se': 'Swedish',
    'th': 'Thai',
    'vi': 'Vietnamese',
}

_client = None


def get_client():
    """Return the shared OpenAI client, creating it on first use."""
    global _client
    if _client is None:
        # Imported lazily so the pure helpers below can be used without the
        # openai package installed.
        from openai import OpenAI
        _client = OpenAI(api_key=OPENAI_API_KEY)
    return _client


def find_tags(text):
    """Return all ``{...}`` placeholder tags in `text`, in order of appearance."""
    return TAG_PATTERN.findall(text)


def translate_text(text, target_language_code):
    """Translate English `text` into the target language using OpenAI GPT.

    Args:
        text: The English source string.
        target_language_code: Language code; looked up in `language_mapping`
            and used verbatim if it is not found there.

    Returns:
        The translation with surrounding whitespace and quotes removed.
    """
    # Default to code if not found
    target_language = language_mapping.get(target_language_code, target_language_code)
    description = """
    We are developing a Python script that automates language localization for our software application.
    This script reads a CSV file with English text and its corresponding translations, and it fills in missing translations using OpenAI's GPT-4.
    Please provide precise translations for the following English text in the specified target language.
    Do not include single quotes around the translated text.
    DO NOT translate text that is within curly brackets {} as these are system tags.
    """
    prompt = f"{description}\n\nTranslate the following English text to {target_language}: '{text}'\n\nTranslation:"
    response = get_client().chat.completions.create(
        model='gpt-4',  # or your preferred model
        messages=[{'role': 'system', 'content': prompt}],
        temperature=0.2,
        max_tokens=300,  # Adjust based on expected length of translations
    )
    translated_text = (response.choices[0].message.content or '').strip()
    # Remove both single and double quotation marks from the translation as
    # GPT sometimes includes these
    return translated_text.strip("'").strip('"')


def restore_tags(original_text, translated_text):
    """Put back placeholder tags that were altered during translation.

    Tags in the translation that already match an original tag are left where
    they are, so a translation that merely reorders placeholders is not
    corrupted. Each remaining tag is replaced, in order, by the next original
    tag that is still missing from the translation.

    Args:
        original_text: The English source string.
        translated_text: The translation to repair.

    Returns:
        The translation with altered tags replaced by the original ones.
    """
    original_tags = find_tags(original_text)
    unmatched = Counter(original_tags)

    # Mark which tags in the translation survived intact.
    intact = []
    for tag in find_tags(translated_text):
        if unmatched[tag] > 0:
            unmatched[tag] -= 1
            intact.append(True)
        else:
            intact.append(False)

    # Original tags that did not survive, in their source order.
    missing = []
    for tag in original_tags:
        if unmatched[tag] > 0:
            unmatched[tag] -= 1
            missing.append(tag)

    flags = iter(intact)
    replacements = iter(missing)

    def fix(match):
        if next(flags):
            return match.group(0)
        # Leave the tag alone when there is no original tag left to restore.
        return next(replacements, match.group(0))

    return TAG_PATTERN.sub(fix, translated_text)


def is_blank(value):
    """Return True for NaN/None cells and for empty or whitespace-only text."""
    return bool(pd.isna(value)) or str(value).strip() == ''


def fill_missing_translations(df, csv_path=None, translate=None):
    """Translate every empty cell of `df` in place.

    The third and later columns are treated as language columns; 'en' holds
    the source text.

    Args:
        df: DataFrame with columns Key, en, <language codes...>.
        csv_path: If given, the DataFrame is saved here after each updated
            row, so progress survives an interrupted run.
        translate: Callable ``(text, language_code) -> str``; defaults to
            `translate_text`.

    Returns:
        The same DataFrame, for convenience.
    """
    if translate is None:
        translate = translate_text
    for index, row in df.iterrows():
        source = row['en']
        if is_blank(source):
            # Nothing to translate; avoid sending empty prompts to the API.
            continue
        updated = False
        for language in df.columns[2:]:
            if is_blank(row[language]):
                translated_text = restore_tags(source, translate(source, language))
                df.at[index, language] = translated_text
                print(f"Original: {row['en']} | Language: {language} | Translation: {translated_text}")
                updated = True
        # Save the updated dataframe to the CSV file after each row update
        if updated and csv_path:
            df.to_csv(csv_path, index=False)
    return df


def run_translation():
    """Read the CSV, fill in the missing translations and save the result."""
    # dtype=str / keep_default_na=False: keep strings such as "None" or "N/A"
    # as text instead of letting pandas turn them into NaN.
    df = pd.read_csv(csv_file_path, dtype=str, keep_default_na=False)
    fill_missing_translations(df, csv_path=csv_file_path)


if __name__ == '__main__':
    run_translation()
Deployment
Finally, the translated content is prepared for deployment. Our script processes the updated CSV file, generating individual language JSON files. These files are then deployed to our development environment within a new branch, ensuring a seamless update process.
import gitlab
import pandas as pd
import json
import os
import uuid
from gitlab.exceptions import GitlabGetError
# Configuration for GitLab
GITLAB_TOKEN = 'GITLAB TOKEN'
GITLAB_PROJECT_ID = '8477037'
REPO_PATH = 'src/locales'
LOCAL_SAVE_PATH = 'uploaded_json_files'  # Local directory to save JSON files
BRANCH_NAME = f'update-translations_{uuid.uuid4()}'  # Generate a unique branch name


def build_language_json(df, lang):
    """Rebuild the nested JSON structure for one language column.

    Args:
        df: DataFrame with a 'Key' column of dot-separated keys and one column
            per language.
        lang: Name of the language column to export.

    Returns:
        A nested dict in which each dot-separated key is expanded into nested
        objects. Rows whose cell for `lang` is missing are skipped.
    """
    json_data = {}
    lang_data = df[['Key', lang]].dropna()
    for _, row in lang_data.iterrows():
        keys = row['Key'].split('.')
        d = json_data
        for key in keys[:-1]:
            d = d.setdefault(key, {})
        d[keys[-1]] = row[lang]
    return json_data


def write_language_files(df, directory):
    """Write one ``<lang>.json`` file per language column into `directory`.

    Args:
        df: DataFrame with columns Key, en, <language codes...>.
        directory: Output folder; created if it does not exist.

    Returns:
        The list of file names written, in column order.
    """
    os.makedirs(directory, exist_ok=True)
    written = []
    # Columns from the third onward are exported, so the first two columns
    # (Key and the English source) never get a JSON file of their own.
    for lang in df.columns[2:]:
        file_name = f'{lang}.json'
        with open(os.path.join(directory, file_name), 'w', encoding='utf-8') as f:
            json.dump(build_language_json(df, lang), f, ensure_ascii=False, indent=2)
        written.append(file_name)
    return written


def deploy_translations():
    """Export translations.csv to JSON, push to a new branch, open a merge request."""
    # Initialize GitLab
    gl = gitlab.Gitlab('https://gitlab.com', private_token=GITLAB_TOKEN)
    project = gl.projects.get(GITLAB_PROJECT_ID)

    # Read CSV File. Only truly empty cells count as missing; pandas' default
    # NA list would also discard real strings such as "None" or "N/A".
    df = pd.read_csv('translations.csv', dtype=str, keep_default_na=False, na_values=[''])

    # Convert DataFrame to individual JSON files
    file_names = write_language_files(df, LOCAL_SAVE_PATH)

    # Create a new branch
    project.branches.create({'branch': BRANCH_NAME, 'ref': 'dev'})

    # Upload or update the JSON files in the new branch. Only files written by
    # this run are uploaded, so stale or unrelated files in the local folder
    # are never pushed.
    for file_name in file_names:
        # Repository paths always use forward slashes, whatever the local OS.
        file_path = f'{REPO_PATH}/{file_name}'
        with open(os.path.join(LOCAL_SAVE_PATH, file_name), 'r', encoding='utf-8') as f:
            file_content = f.read()
        try:
            # Try to get the file, if it exists
            file = project.files.get(file_path=file_path, ref=BRANCH_NAME)
        except GitlabGetError:
            # If the file does not exist, create a new one
            project.files.create({
                'file_path': file_path,
                'branch': BRANCH_NAME,
                'content': file_content,
                'commit_message': f'Create {file_name}'
            })
            print(f"Created new file: {file_name}")
        else:
            # If the file exists, update it
            file.content = file_content
            file.save(branch=BRANCH_NAME, commit_message=f'Update {file_name}')
            print(f"Updated existing file: {file_name}")

    # Create a merge request
    project.mergerequests.create({
        'source_branch': BRANCH_NAME,
        'target_branch': 'dev',
        'title': 'Update Translations'
    })
    print("Merge request created.")


if __name__ == '__main__':
    deploy_translations()