mirror of
https://github.com/open-goal/jak-project.git
synced 2024-10-20 11:26:18 -04:00
319 lines
16 KiB
Python
319 lines
16 KiB
Python
import glob
|
||
import json
|
||
import re
|
||
|
||
import argparse
|
||
|
||
# Command-line interface: a single opt-in "--fix" switch. When it is absent
# `args.fix` is False; the rest of the script reads `args.fix` to decide whether
# to rewrite strings and save the JSON files back out.
parser = argparse.ArgumentParser()
parser.add_argument("--fix", action="store_true", default=False)
args = parser.parse_args()
|
||
|
||
# TODO - trim strings
|
||
|
||
# fmt: off
|
||
# Every single character that may appear in a Jak 1 string without being
# reported by the linter. Membership is tested with `in`, one character at a time.
JAK1_ALLOWED_CHARACTERS = [
    "_",  # NOTE - not an actual underscore, adds a long space!
    # Upper-case ASCII letters (this table has no lower-case ASCII letters)
    *"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
    # Digits
    *"0123456789",
    # ASCII punctuation
    *"'!()+-,./:=<>*%?\"",
    # Accent marks, symbols and assorted Latin capitals
    "`", "ˇ", "¨", "º", "¡", "¿", "Æ", "Ç", "ß", "™", "、", " ", "Å", "Ø", "Ą", "Ę", "Ł", "Ż",
    # Accented Latin letters
    "Ñ", "Ã", "Õ", "Á", "É", "Í", "Ó", "Ú", "Ć", "Ń", "Ś", "Ź", "Ő", "Ű", "Â", "Ê", "Î", "Ô", "Û", "À", "È", "Ì", "Ò", "Ù", "Ä", "Ë", "Ï", "Ö", "ö", "Ü", "Ė", "Č", "Š", "Ž", "Ų", "Ū", "Į",
    # Kanji, a few katakana and Japanese punctuation
    "海", "界", "学", "ワ", "ヲ", "ン", "岩", "旧", "空", "ヮ", "撃", "賢", "湖", "口", "行", "合", "士", "寺", "山", "者", "所", "書", "小", "沼", "上", "城", "場", "出", "闇", "遺", "黄", "屋", "下", "家", "火", "花", "レ", "ロ", "青", "・", "゛", "゜", "ー", "『", "』", "宝", "石", "赤", "跡", "川", "戦", "村", "隊", "台", "長", "鳥", "艇", "洞", "道", "発", "飛", "噴", "池", "中", "塔", "島", "部", "砲", "産", "眷", "力", "緑", "岸", "像", "谷", "心", "森", "水", "船", "世",
    # Hiragana
    "ぁ", "あ", "ぃ", "い", "ぅ", "う", "ぇ", "え", "ぉ", "お", "か", "き", "く", "け", "こ", "さ", "し", "す", "せ", "そ", "た", "ち", "っ", "つ", "て", "と", "な", "に", "ぬ", "ね", "の", "は", "ひ", "ふ", "へ", "ほ", "ま", "み", "む", "め", "も", "ゃ", "や", "ゅ", "ゆ", "ょ", "よ", "ら", "り", "る", "れ", "ろ", "ゎ", "わ", "を", "ん",
    "が", "ぎ", "ぐ", "げ", "ご", "ざ", "じ", "ず", "ぜ", "ぞ", "だ", "ぢ", "づ", "で", "ど", "ば", "び", "ぶ", "べ", "ぼ",
    "ぱ", "ぴ", "ぷ", "ぺ", "ぽ",
    # Katakana
    "ァ", "ア", "ィ", "イ", "ゥ", "ウ", "ェ", "エ", "ォ", "オ", "カ", "キ", "ク", "ケ", "コ", "サ", "シ", "ス", "セ", "ソ", "タ", "チ", "ッ", "ツ", "テ", "ト", "ナ", "ニ", "ヌ", "ネ", "ノ", "ハ", "ヒ", "フ", "ヘ", "ホ", "マ", "ミ", "ム", "メ", "モ", "ャ", "ヤ", "ュ", "ユ", "ョ", "ヨ", "ラ", "リ", "ル",
    "ヴ", "ガ", "ギ", "グ", "ゲ", "ゴ", "ザ", "ジ", "ズ", "ゼ", "ゾ", "ダ", "ヂ", "ヅ", "デ", "ド", "バ", "ビ", "ブ", "ベ", "ボ",
    "パ", "ピ", "プ", "ペ", "ポ",
    # Trailing extras
    "~", "Œ"
]
|
||
|
||
# Multi-character "<...>" codes that are accepted as a unit in Jak 1 strings:
# a character that falls inside one of these is not reported by the linter.
JAK1_ALLOWED_CODES = [
    "<TIL>",
    # Controller button codes
    "<PAD_X>",
    "<PAD_TRIANGLE>",
    "<PAD_CIRCLE>",
    "<PAD_SQUARE>",
]
|
||
|
||
# Substitutions applied by `--fix` to a disallowed Jak 1 character.
# Key: the offending character; value: the text that takes its place.
JAK1_AUTO_REPLACEMENTS = {
    "ª": "º",
    "\n": "",  # embedded newlines are removed outright
    "’": "'",  # typographic apostrophe -> plain apostrophe
}
|
||
|
||
# TODO - check for korean text
|
||
# Every single character that may appear in a Jak 2 string without being
# reported by the linter. Membership is tested with `in`, one character at a
# time; some entries occur more than once, which does not affect that test.
JAK2_ALLOWED_CHARACTERS = [
    "_",  # NOTE - not an actual underscore, adds a long space!
    # Upper- and lower-case ASCII letters
    *"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
    *"abcdefghijklmnopqrstuvwxyz",
    # Digits
    *"0123456789",
    # ASCII punctuation
    *"'!()+-,./:=<>*%?\"",
    # Accent marks, symbols and assorted Latin capitals
    "`", "ˇ", "¨", "º", "¡", "¿", "Æ", "Ç", "ß", "™", "、", " ", "Å", "Ø", "Ą", "Ę", "Ł", "Ż",
    "æ", "ø", "œ",
    # Accented Latin letters, upper-case then lower-case
    "Ñ", "Ã", "Õ", "Á", "É", "Í", "Ó", "Ú", "Ć", "Ń", "Ś", "Ź", "Ő", "Ű", "Â", "Ê", "Î", "Ô", "Û", "À", "È", "Ì", "Ò", "Ù", "Ä", "Ë", "Ï", "Ö", "ö", "Ü", "Ė", "Č", "Š", "Ž", "Ų", "Ū", "Į",
    "ñ", "á", "é", "í", "ó", "ú", "â", "ê", "î", "ô", "û", "à", "è", "ì", "ò", "ù", "ä", "ö", "ü", "ś", "å", "õ", "ã", "ę", "ż", "ć", "ą", "ł", "ń", "ű", "ő", "ė", "č", "š", "ž", "ų", "ū", "į",
    # Japanese punctuation and sound marks
    "・", "゛", "゜", "ー", "『", "』",
    # Kanji (plus a few katakana)
    "海", "界", "学", "ワ", "ヲ", "ン", "岩", "旧", "空", "ヮ", "撃", "賢", "湖", "口", "行", "合", "士", "寺", "山", "者", "所", "書", "小", "沼", "上", "城", "場", "出", "闇", "遺", "黄", "屋", "下", "家", "火", "花", "レ", "ロ", "青", "宝", "石", "赤", "跡", "川", "戦", "村", "隊", "台", "長", "鳥", "艇", "洞", "道", "発", "飛", "噴", "池", "中", "塔", "島", "部", "砲", "産", "眷", "力", "緑", "岸", "像", "谷", "心", "森", "水", "船", "世",
    "位", "遺", "院", "映", "衛", "応", "下", "画", "解", "開", "外", "害", "蓋", "完", "換", "監", "間", "器", "記", "逆", "救", "金", "空", "掘", "警", "迎", "撃", "建", "源", "現", "言", "限", "個", "庫", "後", "語", "護", "交", "功", "向", "工", "攻", "溝", "行", "鉱", "降", "合", "告", "獄", "彩", "作", "山", "使", "始", "試", "字", "寺", "時", "示", "自", "式", "矢", "射", "者", "守", "手", "終", "週", "出", "所", "書", "勝", "章", "上", "乗", "場", "森", "進", "人", "水", "数", "制", "性", "成", "聖", "石", "跡", "先", "戦", "船", "選", "走", "送", "像", "造", "続", "対", "袋", "台", "弾", "地", "中", "敵", "転", "電", "塔", "頭", "動", "内", "日", "入", "年", "能", "廃", "排", "敗",
    "発", "反", "必", "表", "武", "壁", "墓", "放", "方", "砲", "妨", "北", "本", "幕", "無", "迷", "面", "戻", "紋", "薬", "輸", "勇", "友", "遊", "容", "要", "利", "了", "量", "力", "練", "連", "録", "話", "墟", "脱", "旗", "破", "壊", "全", "滅", "機", "仲", "渓", "谷", "優", "探", "部", "索", "前", "右", "左", "会", "高", "低", "押", "切", "替", "秒", "箱", "泳", "~",
    "闇", "以", "屋", "俺", "化", "界", "感", "気", "却", "曲", "継", "権", "見", "古", "好", "才", "士", "子", "次", "主", "種", "讐", "女", "小", "焼", "証", "神", "身", "寸", "世", "想", "退", "第", "着", "天", "倒", "到", "突", "爆", "番", "負", "復", "物", "眠", "予", "用", "落", "緑", "封", "印", "扉", "最", "刻", "足",
    # Hiragana
    "ぁ", "あ", "ぃ", "い", "ぅ", "う", "ぇ", "え", "ぉ", "お", "か", "き", "く", "け", "こ", "さ", "し", "す", "せ", "そ", "た", "ち", "っ", "つ", "て", "と", "な", "に", "ぬ", "ね", "の", "は", "ひ", "ふ", "へ", "ほ", "ま", "み", "む", "め", "も", "ゃ", "や", "ゅ", "ゆ", "ょ", "よ", "ら", "り", "る", "れ", "ろ", "ゎ", "わ", "を", "ん",
    "が", "ぎ", "ぐ", "げ", "ご", "ざ", "じ", "ず", "ぜ", "ぞ", "だ", "ぢ", "づ", "で", "ど", "ば", "び", "ぶ", "べ", "ぼ",
    "ぱ", "ぴ", "ぷ", "ぺ", "ぽ",
    # Katakana
    "ァ", "ア", "ィ", "イ", "ゥ", "ウ", "ェ", "エ", "ォ", "オ", "カ", "キ", "ク", "ケ", "コ", "サ", "シ", "ス", "セ", "ソ", "タ", "チ", "ッ", "ツ", "テ", "ト", "ナ", "ニ", "ヌ", "ネ", "ノ", "ハ", "ヒ", "フ", "ヘ", "ホ", "マ", "ミ", "ム", "メ", "モ", "ャ", "ヤ", "ュ", "ユ", "ョ", "ヨ", "ラ", "リ", "ル",
    "ヴ", "ガ", "ギ", "グ", "ゲ", "ゴ", "ザ", "ジ", "ズ", "ゼ", "ゾ", "ダ", "ヂ", "ヅ", "デ", "ド", "バ", "ビ", "ブ", "ベ", "ボ",
    "パ", "ピ", "プ", "ペ", "ポ",
    # Trailing extras
    "~", "Œ", "°", "ç"
]
|
||
|
||
# Multi-character "<...>" codes that are accepted as a unit in Jak 2 strings:
# a character that falls inside one of these is not reported by the linter.
JAK2_ALLOWED_CODES = [
    "<TIL>",
    "<SUPERSCRIPT_QUOTE>",
    # Controller codes
    "<PAD_X>",
    "<PAD_TRIANGLE>",
    "<PAD_CIRCLE>",
    "<PAD_SQUARE>",
    "<PAD_DPAD_UP>",
    "<PAD_DPAD_DOWN>",
    "<PAD_DPAD_ANY>",
    "<PAD_L1>",
    "<PAD_R1>",
    "<PAD_R2>",
    "<PAD_L2>",
    "<PAD_ANALOG_ANY>",
    "<PAD_ANALOG_LEFT_RIGHT>",
    "<PAD_ANALOG_UP_DOWN>",
    # Icon codes
    "<ICON_MISSION_COMPLETE>",
    "<ICON_MISSION_TODO>",
    # Flag codes
    "<FLAG_ITALIAN>",
    "<FLAG_SPAIN>",
    "<FLAG_GERMAN>",
    "<FLAG_FRANCE>",
    "<FLAG_UK>",
    "<FLAG_USA>",
    "<FLAG_KOREA>",
    "<FLAG_JAPAN>",
]
|
||
|
||
# Substitutions applied by `--fix` to a disallowed Jak 2 character.
# Key: the offending character; value: the text that takes its place.
JAK2_AUTO_REPLACEMENTS = {
    "ª": "º",
    "\n": "",  # embedded newlines are removed outright
    "’": "'",  # typographic apostrophe -> plain apostrophe
}
|
||
# fmt: on
|
||
|
||
# Script-wide failure flag: the file-checking loops below set it to True when a
# linted string contains a disallowed character, and the end of the script reads
# it to choose between a success message and a non-zero exit.
invalid_characters_found = False
|
||
|
||
# TODO - reduce duplication
|
||
|
||
|
||
def jak1_is_allowed_code(pos, text, allowed_codes=None):
    """Return the end index of the allowed code that covers ``pos``, or -1.

    Args:
        pos: Index into ``text`` of the character being checked.
        text: The full string being linted.
        allowed_codes: Optional iterable of literal code strings to look for.
            Defaults to ``JAK1_ALLOWED_CODES``.

    Returns:
        The exclusive end index of the first occurrence of an allowed code
        whose span contains ``pos`` (so the caller can resume scanning right
        after the code), or -1 when ``pos`` is not inside any allowed code.
        A non-negative result is always strictly greater than ``pos``.
    """
    if allowed_codes is None:
        allowed_codes = JAK1_ALLOWED_CODES
    for code in allowed_codes:
        # Codes are literal text, not regex patterns, so escape them.
        for match in re.finditer(re.escape(code), text):
            # match.end() is exclusive: the character AT end() is the first one
            # after the code. Treating it as "inside" would return pos itself,
            # and a caller that jumps to the returned index would never advance.
            if match.start() <= pos < match.end():
                return match.end()
    return -1
|
||
|
||
|
||
def jak1_char_allowed(char):
    """Tell whether ``char`` is an entry of ``JAK1_ALLOWED_CHARACTERS``."""
    is_allowed = char in JAK1_ALLOWED_CHARACTERS
    return is_allowed
|
||
|
||
|
||
def jak1_fix_character(char):
    """Suggest a replacement for a disallowed Jak 1 character.

    The upper-cased form wins when it is an allowed character; otherwise the
    entry from ``JAK1_AUTO_REPLACEMENTS`` is used. When neither applies the
    character is returned unchanged, which callers treat as "no fix available".
    """
    candidate = char.upper()
    if jak1_char_allowed(candidate):
        return candidate
    return JAK1_AUTO_REPLACEMENTS.get(char, char)
|
||
|
||
|
||
def jak1_replace_character(string, position, new_character):
    """Return a copy of ``string`` with the item at ``position`` swapped out.

    ``new_character`` may be any string, including the empty string (which
    removes the character) or a longer string. ``position`` follows list
    indexing rules, so an out-of-range value raises ``IndexError``.
    """
    pieces = [*string]
    pieces[position] = new_character
    return "".join(pieces)
|
||
|
||
|
||
def lint_jak1_characters(text):
    """Check ``text`` character by character against the Jak 1 allow-list.

    A character passes when ``jak1_char_allowed`` accepts it or when
    ``jak1_is_allowed_code`` reports that it lies inside an allowed code.
    When the module-level ``args.fix`` is set, a failing character is replaced
    with the result of ``jak1_fix_character`` if that differs from the original.
    Characters that fail and cannot be fixed are printed.

    Args:
        text: The string to check.

    Returns:
        A ``(invalid_characters_found, text)`` tuple: whether any unfixable
        character was reported, and the (possibly modified) string.
    """
    invalid_characters_found = False
    pos = 0
    while pos < len(text):
        character = text[pos]
        if jak1_char_allowed(character):
            pos += 1
            continue

        # Check to see if it's part of an allowed code. Only trust an end
        # position that actually moves us forward; anything else (-1, or an end
        # equal to pos) is treated as "not a code" so the loop always advances.
        code_end_pos = jak1_is_allowed_code(pos, text)
        if code_end_pos > pos:
            # advance to the end of the code and continue checking
            pos = code_end_pos
            continue

        # If we are fixing instances, attempt to do so
        if args.fix:
            new_char = jak1_fix_character(character)
            if new_char != character:
                text = jak1_replace_character(text, pos, new_char)
                # Step over exactly what was inserted. A replacement may be
                # empty (the character is deleted), in which case the next
                # character has shifted into `pos` and must still be checked.
                pos += len(new_char)
                continue

        print("Character '{}' not allowed - Found in {}".format(character, text))
        invalid_characters_found = True
        pos += 1
    return invalid_characters_found, text
|
||
|
||
|
||
# Lint every string in the Jak 1 text JSON files. Each file is expected to be a
# flat JSON object whose values are handed to lint_jak1_characters.
text_files = glob.glob("./game/assets/jak1/text/*.json")

for text_file in text_files:
    print(f"Checking {text_file}")
    with open(text_file, encoding="utf-8") as handle:
        file_data = json.load(handle)

    for key, original in file_data.items():
        has_invalid, linted = lint_jak1_characters(original)
        if has_invalid:
            invalid_characters_found = True
        if args.fix:
            file_data[key] = linted

    if args.fix:
        # With --fix the file is always written back out, changed or not
        with open(text_file, "w", encoding="utf-8") as handle:
            json.dump(file_data, handle, indent=2, ensure_ascii=False)
            handle.write("\n")
|
||
|
||
# Lint the Jak 1 subtitle files: the "speakers" mapping first, then every line
# listed under "cutscenes" and under "hints".
subtitle_files = glob.glob("./game/assets/jak1/subtitle/*lines*.json")

for subtitle_file in subtitle_files:
    print(f"Checking {subtitle_file}...")
    with open(subtitle_file, encoding="utf-8") as handle:
        file_data = json.load(handle)

    # Check Speakers
    for speaker_id, speaker_name in file_data["speakers"].items():
        has_invalid, linted = lint_jak1_characters(speaker_name)
        if has_invalid:
            invalid_characters_found = True
        if args.fix and linted != speaker_name:
            file_data["speakers"][speaker_id] = linted

    # Check Lines - both sections map an id to a list of lines
    for section in ("cutscenes", "hints"):
        for lines in file_data[section].values():
            for index, line in enumerate(lines):
                has_invalid, linted = lint_jak1_characters(line)
                if has_invalid:
                    invalid_characters_found = True
                if args.fix and linted != line:
                    lines[index] = linted

    if args.fix:
        # save the modified file back out
        with open(subtitle_file, "w", encoding="utf-8") as handle:
            json.dump(file_data, handle, indent=2, ensure_ascii=False)
            handle.write("\n")
|
||
|
||
|
||
def jak2_is_allowed_code(pos, text, allowed_codes=None):
    """Return the end index of the allowed code that covers ``pos``, or -1.

    Args:
        pos: Index into ``text`` of the character being checked.
        text: The full string being linted.
        allowed_codes: Optional iterable of literal code strings to look for.
            Defaults to ``JAK2_ALLOWED_CODES``.

    Returns:
        The exclusive end index of the first occurrence of an allowed code
        whose span contains ``pos`` (so the caller can resume scanning right
        after the code), or -1 when ``pos`` is not inside any allowed code.
        A non-negative result is always strictly greater than ``pos``.
    """
    if allowed_codes is None:
        allowed_codes = JAK2_ALLOWED_CODES
    for code in allowed_codes:
        # Codes are literal text, not regex patterns, so escape them.
        for match in re.finditer(re.escape(code), text):
            # match.end() is exclusive: the character AT end() is the first one
            # after the code. Treating it as "inside" would return pos itself,
            # and a caller that jumps to the returned index would never advance.
            if match.start() <= pos < match.end():
                return match.end()
    return -1
|
||
|
||
|
||
def jak2_char_allowed(char):
    """Tell whether ``char`` is an entry of ``JAK2_ALLOWED_CHARACTERS``."""
    is_allowed = char in JAK2_ALLOWED_CHARACTERS
    return is_allowed
|
||
|
||
|
||
def jak2_fix_character(char):
    """Suggest a replacement for a disallowed Jak 2 character.

    Returns the entry from ``JAK2_AUTO_REPLACEMENTS`` when there is one;
    otherwise the character is returned unchanged, which callers treat as
    "no fix available".
    """
    return JAK2_AUTO_REPLACEMENTS.get(char, char)
|
||
|
||
|
||
def jak2_replace_character(string, position, new_character):
    """Return a copy of ``string`` with the item at ``position`` swapped out.

    ``new_character`` may be any string, including the empty string (which
    removes the character) or a longer string. ``position`` follows list
    indexing rules, so an out-of-range value raises ``IndexError``.
    """
    pieces = [*string]
    pieces[position] = new_character
    return "".join(pieces)
|
||
|
||
|
||
def lint_jak2_characters(text):
    """Check ``text`` character by character against the Jak 2 allow-list.

    A character passes when ``jak2_char_allowed`` accepts it or when
    ``jak2_is_allowed_code`` reports that it lies inside an allowed code.
    When the module-level ``args.fix`` is set, a failing character is replaced
    with the result of ``jak2_fix_character`` if that differs from the original.
    Characters that fail and cannot be fixed are printed.

    Args:
        text: The string to check.

    Returns:
        A ``(invalid_characters_found, text)`` tuple: whether any unfixable
        character was reported, and the (possibly modified) string.
    """
    invalid_characters_found = False
    pos = 0
    while pos < len(text):
        character = text[pos]
        if jak2_char_allowed(character):
            pos += 1
            continue

        # Check to see if it's part of an allowed code. Only trust an end
        # position that actually moves us forward; anything else (-1, or an end
        # equal to pos) is treated as "not a code" so the loop always advances.
        code_end_pos = jak2_is_allowed_code(pos, text)
        if code_end_pos > pos:
            # advance to the end of the code and continue checking
            pos = code_end_pos
            continue

        # If we are fixing instances, attempt to do so
        if args.fix:
            new_char = jak2_fix_character(character)
            if new_char != character:
                text = jak2_replace_character(text, pos, new_char)
                # Step over exactly what was inserted. A replacement may be
                # empty (the character is deleted), in which case the next
                # character has shifted into `pos` and must still be checked.
                pos += len(new_char)
                continue

        print("Character '{}' not allowed - Found in {}".format(character, text))
        invalid_characters_found = True
        pos += 1
    return invalid_characters_found, text
|
||
|
||
|
||
# Lint every string in the Jak 2 text JSON files. Each file is expected to be a
# flat JSON object whose values are handed to lint_jak2_characters.
text_files = glob.glob("./game/assets/jak2/text/*.json")

for text_file in text_files:
    print(f"Checking {text_file}")
    with open(text_file, encoding="utf-8") as handle:
        file_data = json.load(handle)

    for key, original in file_data.items():
        has_invalid, linted = lint_jak2_characters(original)
        if has_invalid:
            invalid_characters_found = True
        if args.fix:
            file_data[key] = linted

    if args.fix:
        # With --fix the file is always written back out, changed or not
        with open(text_file, "w", encoding="utf-8") as handle:
            json.dump(file_data, handle, indent=2, ensure_ascii=False)
            handle.write("\n")
|
||
|
||
# subtitle_files = glob.glob("./game/assets/jak2/subtitle/*lines*.json")
|
||
|
||
# for subtitle_file in subtitle_files:
|
||
# print("Checking {}...".format(subtitle_file))
|
||
# with open(subtitle_file, encoding="utf-8") as f:
|
||
# file_data = json.load(f)
|
||
# # Check Speakers
|
||
# for id, text in file_data["speakers"].items():
|
||
# invalid_chars_exist, new_text = lint_jak2_characters(text)
|
||
# if args.fix and new_text != text:
|
||
# file_data["speakers"][id] = new_text
|
||
# if invalid_chars_exist:
|
||
# invalid_characters_found = True
|
||
# # Check Lines
|
||
# for id, lines in file_data["cutscenes"].items():
|
||
# for i, line in enumerate(lines):
|
||
# invalid_chars_exist, new_text = lint_jak2_characters(line)
|
||
# if args.fix and new_text != line:
|
||
# lines[i] = new_text
|
||
# if invalid_chars_exist:
|
||
# invalid_characters_found = True
|
||
# for id, lines in file_data["hints"].items():
|
||
# for i, line in enumerate(lines):
|
||
# invalid_chars_exist, new_text = lint_jak2_characters(line)
|
||
# if args.fix and new_text != line:
|
||
# lines[i] = new_text
|
||
# if invalid_chars_exist:
|
||
# invalid_characters_found = True
|
||
# if args.fix:
|
||
# # save the modified file back out
|
||
# with open(subtitle_file, "w", encoding="utf-8") as f:
|
||
# json.dump(file_data, f, indent=2, ensure_ascii=False)
|
||
# f.write("\n")
|
||
|
||
# Report the overall result. A non-zero exit status signals to the caller
# (for example a CI job) that at least one disallowed character was reported.
if invalid_characters_found:
    print("Invalid characters were found, see above")
    # Raise SystemExit directly: the bare `exit()` helper is injected by the
    # `site` module for interactive use and is missing when Python runs with -S.
    raise SystemExit(1)
else:
    print("No invalid characters found!")
|