10from pprint
import pprint
12from html.parser
import HTMLParser
13from html.entities
import name2codepoint
17DLG_JSON_HUMAN_EXTENSION =
".dlg_human.json"
25 BLUE_LIGHT =
'\033[1;36m'
28 GREEN_LIGHT =
'\033[1;32m'
31 YELLOW_LIGHT =
'\033[1;33m'
34 RED_LIGHT =
'\033[1;31m'
42 print(
'\n' * nr, end=
'')
46 if sys.stdout.isatty():
51 if sys.stdout.isatty():
53 prefix, suffix = color, Colors.END
56 prefix, suffix =
'',
''
58 print(prefix + string + suffix, **kwargs)
82 _print_internal(Colors.YELLOW_LIGHT,
" ".join(map(str, args)), **kwargs)
94 print_blue(
"{} = ".format(config_name), end=
'')
106 NODE_START =
"node-start"
107 NODE_END =
"node-end"
108 NODE_VIRTUAL_PARENT =
"node-virtual-parent"
109 NODE_SPEECH =
"node-speech"
110 NODE_SPEECH_SEQUENCE =
"node-speech-sequence"
111 NODE_SELECTOR_FIRST =
"node-selector-first"
112 NODE_SELECTOR_RANDOM =
"node-selector-random"
124 tags_set = set([x.lower()
for x
in tags_list])
130 REGEX_NAME =
r"(-?\d+)\.\s*(.*)"
135 matches_name = re.finditer(cls.
REGEX_NAME, raw_name, re.MULTILINE | re.UNICODE)
136 node_index, speaker =
None,
None
137 for index, match
in enumerate(matches_name):
139 print_yellow(
"{}, got multiple name matches".format(context_multiple_matches))
142 group_index = match.group(1)
143 if group_index
is not None:
146 print_yellow(
"{}, could not get node index from <node index>. <Speaker>".format(context_invalid_index))
148 group_speaker = match.group(2)
149 if group_index
is not None:
150 speaker = group_speaker.strip()
152 print_yellow(
"{}, could not get speaker from <node index>. <Speaker>".format(context_invalid_speaker))
154 return node_index, speaker
159 return text.strip().replace(
"\n",
"\r\n")
163 IGNORE_EMPTY_TEXT_FLAG =
"~ignore~"
181 print_yellow(
"Node Index = {} has an edge with len(parts) = {}. There must be exactly 2. Did you use `|` inside your edge?".format(self.
owner_node_index, len(parts)))
192 context_parse_name =
"Node Index = {} Edge, parts[1] = `{}`".format(self.
owner_node_index, parts[1])
193 self.
target_node_index, ignored_speaker = TwineHelper.parse_twine_node_name(parts[1], context_parse_name, context_parse_name, context_parse_name)
208 return "TwineEdgeData(target_node_index = {}, text = `{}`)".format(self.
target_node_index, self.
text)
215 REGEX_SPEAKER =
r"``\s*Speaker\s*:\s*``\s*//(.*)//"
216 REGEX_TEXT =
r"``\s*Text\s*:\s*``\s*//(.*)//"
217 REGEX_EDGE_TEXT =
r"``\s*EdgeText\s*:\s*``\s*//(.*)//"
229 matches_text = re.finditer(self.
REGEX_SPEAKER, self.
raw_data, re.MULTILINE | re.UNICODE | re.IGNORECASE)
230 for index, match
in enumerate(matches_text):
235 group = match.group(1)
237 print_yellow(
"Node speech sequence Index = {} could not get group 1 that matches ``Speaker:`` //<Name>//".format(self.
owner_node_index))
243 matches_text = re.finditer(self.
REGEX_TEXT, self.
raw_data, re.MULTILINE | re.UNICODE | re.IGNORECASE)
244 for index, match
in enumerate(matches_text):
249 group = match.group(1)
254 self.
text = TwineHelper.clean_text(group.strip())
257 matches_edge_text = re.finditer(self.
REGEX_EDGE_TEXT, self.
raw_data, re.MULTILINE | re.UNICODE | re.IGNORECASE)
258 for index, match
in enumerate(matches_edge_text):
263 group = match.group(1)
265 print_yellow(
"Node speech sequence Index = {} could not get group 1 that matches ``EdgeText:`` //<edge_text>//".format(self.
owner_node_index))
281 return "TwineInnerEdgeData(speaker = {}, text = {}, edge_text = `{}`)".format(self.
speaker, self.
text, self.
edge_text)
288 REGEX_EDGES =
r"\[\[(.*)\]\]"
304 index_edge_start = self.
raw_data.find(
"[[")
305 if index_edge_start == -1:
310 return self.
raw_data[0:index_edge_start]
324 for index, match
in enumerate(matches):
325 group = match.group(1)
327 print_yellow(
"Node Index = {} could not get group 1 that matches [[<edge content>|<edge index>]]".format(self.
node_index))
331 edge.raw_data = group.strip()
334 self.
edges.append(edge)
342 inner_edges_parts = raw_text_data.split(
"---")
343 if not inner_edges_parts:
344 print_yellow(
"Node Index = {} which is a speech sequence node does not have inner edges".format(self.
node_index))
347 for raw_inner_edge
in inner_edges_parts:
349 inner_edge.raw_data = raw_inner_edge.strip()
355 self.
tags = [x.lower()
for x
in self.
raw_tags.strip().split(
" ")]
358 context_parse_name =
"Node Name = {}".format(self.
raw_name)
359 self.
node_index, self.
speaker = TwineHelper.parse_twine_node_name(self.
raw_name, context_parse_name, context_parse_name, context_parse_name)
362 if not TwineNodeTag.has_valid_tags(self.
tags):
375 return TwineNodeTag.NODE_START
in self.
tags
378 return TwineNodeTag.NODE_END
in self.
tags
381 return TwineNodeTag.NODE_SPEECH
in self.
tags
384 return TwineNodeTag.NODE_VIRTUAL_PARENT
in self.
tags
387 return TwineNodeTag.NODE_SPEECH_SEQUENCE
in self.
tags
393 return TwineNodeTag.NODE_SELECTOR_FIRST
in self.
tags
396 return TwineNodeTag.NODE_SELECTOR_RANDOM
in self.
tags
404 for edge
in self.
edges:
405 edges.append(edge.to_dict())
409 inner_edges.append(inner_edge.to_dict())
415 "Sequence": inner_edges,
430 return "TwineNodeData(node_index = {}, speakr = {}, tags = {}, text = `{}`, edges = {})".format(self.
node_index, self.
speaker, self.
tags, self.
text, self.
edges)
446 temp_uuid = uuid.UUID(self.
raw_guid)
457 speech_sequence_nodes = []
458 for node
in self.
nodes:
459 if node.is_node_speech_sequence():
460 speech_sequence_nodes.append(node.to_dict())
461 elif node.is_node_speech()
or node.is_node_virtual_parent()
or node.is_node_start():
462 speech_nodes.append(node.to_dict())
470 "SpeechNodes": speech_nodes,
471 "SpeechSequenceNodes": speech_sequence_nodes
475 return "TwineDocumentData(dialogue_name = {}, dialogue_guid = {}, nodes =\n{})".format(self.
dialogue_name, self.
dialogue_guid,
"\n".join(str(n)
for n
in self.
nodes))
482 HTML_TAG_STORYDATA =
"tw-storydata"
483 HTML_TAG_PASSAGE_DATA =
"tw-passagedata"
485 HTML_ATTR_NAME =
"name"
486 HTML_ATTR_TAGS =
"tags"
487 HTML_ATTR_GUID =
"ifid"
501 attr_name, attr_value = attr
503 self.
document.dialogue_name = attr_value.strip()
505 self.
document.raw_guid = attr_value.strip()
513 attr_name, attr_value = attr
539 print(
"Comment :", data)
542 c = chr(name2codepoint[name])
543 print(
"Named ent:", c)
546 if name.startswith(
'x'):
547 c = chr(int(name[1:], 16))
550 print(
"Num ent :", c)
553 print(
"Decl :", data)
561 if message
is not None:
571 if not os.path.isabs(path):
572 return os.path.abspath(path)
578 if not os.path.isfile(path):
581 filename = os.path.basename(str(path))
582 file, extension = os.path.splitext(filename)
584 if extension !=
".html":
594 with open(path,
'w')
as fh:
596 json.dump(dictionary, fh, indent=4)
597 except ValueError
as e:
598 print_red(
"Can't save file = `{}`. Error = `{}`".format(path, e))
601 print_red(
"Can't open file = `{}`. IOError = `{}`".format(path, e))
609 with open(path,
'r', encoding=
"utf8")
as fh:
611 parser.feed(fh.read())
612 return parser.document
614 print_red(
"Can't open file = `{}`. IOError = `{}`".format(path, e))
620 src_dirname, src_filename = os.path.split(src_file_path)
622 src_dirname_parts = src_dirname.split(os.sep)
624 for index, part
in enumerate(src_dirname_parts):
625 if part == src_twine_dir_from:
626 dst_dirname = os.sep.join(src_dirname_parts[index + 1:])
629 if dst_dirname
is None:
630 print_yellow(
"Can't find dst_dirname for src_file_path = `{}`".format(src_file_path))
634 dst_dirname = os.path.join(dst_json_dir, dst_dirname)
635 if not os.path.exists(dst_dirname):
636 os.makedirs(dst_dirname, exist_ok=
True)
637 print_blue(
"Creating directory = `{}`".format(dst_dirname))
638 if not os.path.isdir(dst_json_dir):
639 print_yellow(
"Path = `{}` is not a directory. Ignoring".format(dst_dirname))
643 print_blue(
"Parsing file = `{}`".format(src_file_path))
645 if twine_document
is None:
646 print_yellow(
"Can't parse twine file = `{}`".format(src_file_path))
652 json_human_content = twine_document.to_dict()
653 if not json_human_content:
654 print_yellow(
"Twine file = `{}` is corrupt or invalid. Can't parse any data".format(src_file_path))
658 src_file, src_file_ext = os.path.splitext(src_filename)
659 dst_file_path = os.path.join(dst_dirname, src_file) + DLG_JSON_HUMAN_EXTENSION
660 print_blue(
"Writing file = `{}`".format(dst_file_path))
666def main(src_twine_dir, dst_json_dir):
667 if not os.path.exists(src_twine_dir):
669 if not os.path.isdir(src_twine_dir):
672 if not os.path.exists(dst_json_dir):
673 os.makedirs(dst_json_dir, exist_ok=
True)
674 print_blue(
"Creating dst_json_dir = `{}`".format(dst_json_dir))
675 if not os.path.isdir(dst_json_dir):
681 print_blue(
"Finding save files in src_twine_dir = {}\n".format(src_twine_dir))
684 src_twine_dir_from = os.path.basename(os.path.normpath(src_twine_dir))
685 for path, subdirs, files
in os.walk(src_twine_dir):
687 full_filename = os.path.join(path, name)
691 print_yellow(
"Path = `{}` is not a file or a twine file".format(full_filename))
if __name__ == "__main__":
    # Command-line entry point. Both directories are optional positional
    # arguments (nargs='?'); when omitted they fall back to the defaults below.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'src_twine_dir',
        default="DialoguesTwine/",
        help='Source directory from where we get all the .html twine files',
        nargs='?',
        type=str,
    )
    parser.add_argument(
        'dst_json_dir',
        default="DialoguesJsonHumanText/",
        help='Destination directory where we store all the .dlg_human.json files',
        nargs='?',
        type=str,
    )

    # Parse the command line and pass the two directories on to main().
    args = parser.parse_args()
    main(args.src_twine_dir, args.dst_json_dir)
_parse_dialogue_guid(self)
str IGNORE_EMPTY_TEXT_FLAG
parse_twine_node_name(cls, raw_name, context_multiple_matches, context_invalid_index, context_invalid_speaker)
str HTML_TAG_PASSAGE_DATA
handle_comment(self, data)
handle_starttag(self, tag, attrs)
handle_entityref(self, name)
handle_charref(self, name)
is_node_selector_random(self)
is_node_virtual_parent(self)
is_node_speech_sequence(self)
__get_raw_data_until_edges(self)
can_have_text_on_edges(self)
is_node_selector_first(self)
has_valid_tags(cls, tags_list)
print_red_light(*args, **kwargs)
convert_path_to_absolute_if_not_already(path)
exit_program_error(message=None)
print_blue_light(*args, **kwargs)
print_green_light(*args, **kwargs)
print_blue(*args, **kwargs)
print_yellow(*args, **kwargs)
main(src_twine_dir, dst_json_dir)
print_green(*args, **kwargs)
export_twine_file_dlg_text_json(src_file_path, src_twine_dir_from, dst_json_dir)
print_red(*args, **kwargs)
print_config_value(config_name, config_value)
print_yellow_light(*args, **kwargs)
json_save_dictionary(path, dictionary)
_print_internal(color, string, **kwargs)