A Demo Project for the UnrealEngineSDK
Loading...
Searching...
No Matches
DlgTwineToJsonHumanText.py
Go to the documentation of this file.
1#!/usr/bin/env python3
2# Copyright 2017-2018 Csaba Molnar, Daniel Butum
3
4import os
5import sys
6import argparse
7import json
8import uuid
9import re
10from pprint import pprint
11
12from html.parser import HTMLParser
13from html.entities import name2codepoint
14
15# NOTE: This script is standalone, it does not depend on any third-party libraries
16
17DLG_JSON_HUMAN_EXTENSION = ".dlg_human.json"
18ROOT_NODE_INDEX = -1
19
20
class Colors:
    """Terminal escape sequences used by the print_* helpers to color their output."""

    HEADER = "\033[95m"

    # Every color has a normal and a *_LIGHT variant
    BLUE, BLUE_LIGHT = "\033[0;36m", "\033[1;36m"
    GREEN, GREEN_LIGHT = "\033[0;32m", "\033[1;32m"
    YELLOW, YELLOW_LIGHT = "\033[0;33m", "\033[1;33m"
    RED, RED_LIGHT = "\033[0;31m", "\033[1;31m"

    # No Color
    END = "\033[0m"
38
39
def print_newlines(nr = 1):
    """Print `nr` newline characters to stdout; does nothing unless `nr` is positive."""
    if not nr > 0:
        return

    blank_lines = '\n' * nr
    print(blank_lines, end='')
43
44
46 if sys.stdout.isatty():
47 print(Colors.END)
48
49
def _print_internal(color, string, **kwargs):
    """
    Print `string` wrapped in `color` ... Colors.END, but only when stdout is a terminal.

    When stdout is piped or redirected the string is printed without any escape sequences.
    All keyword arguments are forwarded to print().
    """
    # Only a real terminal gets the color codes
    prefix, suffix = (color, Colors.END) if sys.stdout.isatty() else ('', '')
    print(prefix + string + suffix, **kwargs)
59
60
def print_red(*args, **kwargs):
    """Print the space separated positional arguments in Colors.RED; kwargs go to print()."""
    message = " ".join(str(arg) for arg in args)
    _print_internal(Colors.RED, message, **kwargs)
63
64
def print_red_light(*args, **kwargs):
    """Print the space separated positional arguments in Colors.RED_LIGHT; kwargs go to print()."""
    message = " ".join(str(arg) for arg in args)
    _print_internal(Colors.RED_LIGHT, message, **kwargs)
67
68
def print_blue(*args, **kwargs):
    """Print the space separated positional arguments in Colors.BLUE; kwargs go to print()."""
    message = " ".join(str(arg) for arg in args)
    _print_internal(Colors.BLUE, message, **kwargs)
71
72
def print_blue_light(*args, **kwargs):
    """Print the space separated positional arguments in Colors.BLUE_LIGHT; kwargs go to print()."""
    message = " ".join(str(arg) for arg in args)
    _print_internal(Colors.BLUE_LIGHT, message, **kwargs)
75
76
def print_yellow(*args, **kwargs):
    """Print the space separated positional arguments in Colors.YELLOW; kwargs go to print()."""
    message = " ".join(str(arg) for arg in args)
    _print_internal(Colors.YELLOW, message, **kwargs)
79
80
def print_yellow_light(*args, **kwargs):
    """Print the space separated positional arguments in Colors.YELLOW_LIGHT; kwargs go to print()."""
    message = " ".join(str(arg) for arg in args)
    _print_internal(Colors.YELLOW_LIGHT, message, **kwargs)
83
84
def print_green(*args, **kwargs):
    """Print the space separated positional arguments in Colors.GREEN; kwargs go to print()."""
    message = " ".join(str(arg) for arg in args)
    _print_internal(Colors.GREEN, message, **kwargs)
87
88
def print_green_light(*args, **kwargs):
    """Print the space separated positional arguments in Colors.GREEN_LIGHT; kwargs go to print()."""
    message = " ".join(str(arg) for arg in args)
    _print_internal(Colors.GREEN_LIGHT, message, **kwargs)
91
92
def print_config_value(config_name, config_value):
    """Print `<config_name> = <config_value>` on one line, the value in the lighter blue."""
    label = "{} = ".format(config_name)
    # end='' keeps the value on the same line as the label
    print_blue(label, end='')
    print_blue_light(config_value)
96
97
def string_to_int(string):
    """
    Converts `string` to an int.

    Returns None instead of raising when the value can't be converted. This covers both
    malformed text (ValueError) and values int() does not accept at all, like None (TypeError).
    """
    try:
        return int(string)
    except (TypeError, ValueError):
        return None
103
104
106 NODE_START = "node-start"
107 NODE_END = "node-end"
108 NODE_VIRTUAL_PARENT = "node-virtual-parent"
109 NODE_SPEECH = "node-speech"
110 NODE_SPEECH_SEQUENCE = "node-speech-sequence"
111 NODE_SELECTOR_FIRST = "node-selector-first"
112 NODE_SELECTOR_RANDOM = "node-selector-random"
113
114 @classmethod
118 @classmethod
119 def is_valid_tag(cls, tag):
120 return tag.lower() in cls.get_all_tags()
121
122 @classmethod
123 def has_valid_tags(cls, tags_list):
124 tags_set = set([x.lower() for x in tags_list])
125 common = cls.get_all_tags().intersection(tags_set)
126 return bool(common)
127
128
130 REGEX_NAME = r"(-?\d+)\.\s*(.*)"
131
132 @classmethod
133 def parse_twine_node_name(cls, raw_name, context_multiple_matches, context_invalid_index, context_invalid_speaker):
134 # Get node index and speaker
135 matches_name = re.finditer(cls.REGEX_NAME, raw_name, re.MULTILINE | re.UNICODE)
136 node_index, speaker = None, None
137 for index, match in enumerate(matches_name):
138 if index > 0:
139 print_yellow("{}, got multiple name matches".format(context_multiple_matches))
140 break
141
142 group_index = match.group(1)
143 if group_index is not None:
144 node_index = string_to_int(group_index.strip())
145 else:
146 print_yellow("{}, could not get node index from <node index>. <Speaker>".format(context_invalid_index))
147
148 group_speaker = match.group(2)
149 if group_index is not None:
150 speaker = group_speaker.strip()
151 else:
152 print_yellow("{}, could not get speaker from <node index>. <Speaker>".format(context_invalid_speaker))
153
154 return node_index, speaker
155
156 @classmethod
157 def clean_text(cls, text):
158 # Use windows line endings
159 return text.strip().replace("\n", "\r\n")
160
161
163 IGNORE_EMPTY_TEXT_FLAG = "~ignore~"
164
165 def __init__(self):
166 self.raw_data = None
167 self.raw_text = None
168
169 self.text = None
172
173 # The edge has empty text
175 return self.raw_text is None or self.IGNORE_EMPTY_TEXT_FLAG in self.raw_text.lower()
176
177 def parse(self):
178 # TODO make sure there are not multiple of these
179 parts = self.raw_data.split("|")
180 if len(parts) != 2:
181 print_yellow("Node Index = {} has an edge with len(parts) = {}. There must be exactly 2. Did you use `|` inside your edge?".format(self.owner_node_index, len(parts)))
182 return
183
184 # Text
185 self.raw_text = parts[0]
186 if self.is_empty_edge_text():
187 self.text = ""
188 else:
189 self.text = TwineHelper.clean_text(self.raw_text)
190
191 # Target nnode index
192 context_parse_name = "Node Index = {} Edge, parts[1] = `{}`".format(self.owner_node_index, parts[1])
193 self.target_node_index, ignored_speaker = TwineHelper.parse_twine_node_name(parts[1], context_parse_name, context_parse_name, context_parse_name)
194
195
196 def to_dict(self):
197 if self.text is None or self.target_node_index is None or self.target_node_index < ROOT_NODE_INDEX:
198 print(self.text)
199 print_yellow("Node index = {}, Edge invalid = {}. ignoring.".format(self.owner_node_index, str(self)))
200 return {}
201
202 return {
203 "TargetNodeIndex": self.target_node_index,
204 "Text": self.text
205 }
206
207 def __str__(self):
208 return "TwineEdgeData(target_node_index = {}, text = `{}`)".format(self.target_node_index, self.text)
209
210 def __repr__(self):
211 return str(self)
212
213
215 REGEX_SPEAKER = r"``\s*Speaker\s*:\s*``\s*//(.*)//"
216 REGEX_TEXT = r"``\s*Text\s*:\s*``\s*//(.*)//"
217 REGEX_EDGE_TEXT = r"``\s*EdgeText\s*:\s*``\s*//(.*)//"
218
219 def __init__(self):
220 self.raw_data = None
221
222 self.speaker = None
223 self.text = None
224 self.edge_text = None
226
227 def parse(self):
228 # Parse speaker
229 matches_text = re.finditer(self.REGEX_SPEAKER, self.raw_data, re.MULTILINE | re.UNICODE | re.IGNORECASE)
230 for index, match in enumerate(matches_text):
231 if index > 0:
232 print_yellow("Node speech sequence Index = {} got multiple matches for Speaker".format(self.owner_node_index))
233 break
234
235 group = match.group(1)
236 if group is None:
237 print_yellow("Node speech sequence Index = {} could not get group 1 that matches ``Speaker:`` //<Name>//".format(self.owner_node_index))
238 continue
239
240 self.speaker = group.strip()
241
242 # Parse text
243 matches_text = re.finditer(self.REGEX_TEXT, self.raw_data, re.MULTILINE | re.UNICODE | re.IGNORECASE)
244 for index, match in enumerate(matches_text):
245 if index > 0:
246 print_yellow("Node speech sequence Index = {} got multiple matches for Text".format(self.owner_node_index))
247 break
248
249 group = match.group(1)
250 if group is None:
251 print_yellow("Node speech sequence Index = {} could not get group 1 that matches ``Text:`` //<text>//".format(self.owner_node_index))
252 continue
253
254 self.text = TwineHelper.clean_text(group.strip())
255
256 # Parse edge text
257 matches_edge_text = re.finditer(self.REGEX_EDGE_TEXT, self.raw_data, re.MULTILINE | re.UNICODE | re.IGNORECASE)
258 for index, match in enumerate(matches_edge_text):
259 if index > 0:
260 print_yellow("Node speech sequence Index = {} got multiple matches for edge text".format(self.owner_node_index))
261 break
262
263 group = match.group(1)
264 if group is None:
265 print_yellow("Node speech sequence Index = {} could not get group 1 that matches ``EdgeText:`` //<edge_text>//".format(self.owner_node_index))
266 continue
267 self.edge_text = group.strip()
268
269 def to_dict(self):
270 if self.speaker is None or self.raw_data is None or self.text is None or self.edge_text is None:
271 return {}
272
273 return {
274 "Speaker": self.speaker,
275 "Text": self.text,
276 "EdgeText": self.edge_text
277 }
278
279
280 def __str__(self):
281 return "TwineInnerEdgeData(speaker = {}, text = {}, edge_text = `{}`)".format(self.speaker, self.text, self.edge_text)
282
283 def __repr__(self):
284 return str(self)
285
286
288 REGEX_EDGES = r"\[\[(.*)\]\]"
289
290 def __init__(self):
291 self.raw_name = None
292 self.raw_data = None
293 self.raw_tags = None
294
295 # Computed from raw data
296 self.node_index = None
297 self.speaker = None
298 self.text = ""
299 self.tags = []
300 self.edges = []
301 self.inner_edges = []
302
304 index_edge_start = self.raw_data.find("[[")
305 if index_edge_start == -1:
306 # take whole string
307 return self.raw_data
308
309 # Until the first
310 return self.raw_data[0:index_edge_start]
311
312 def _parse_text(self):
313 if not self.can_have_text():
314 return
315
316 self.text = TwineHelper.clean_text(self.__get_raw_data_until_edges())
317
318 def _parse_edges(self):
319 # Refuse to parse, because on some nodes we don't care about the edge text
320 if not self.raw_data or not self.can_have_text_on_edges():
321 return None
322
323 matches = re.finditer(self.REGEX_EDGES, self.raw_data, re.MULTILINE | re.UNICODE)
324 for index, match in enumerate(matches):
325 group = match.group(1)
326 if group is None:
327 print_yellow("Node Index = {} could not get group 1 that matches [[<edge content>|<edge index>]]".format(self.node_index))
328 continue
329
330 edge = TwineEdgeData()
331 edge.raw_data = group.strip()
332 edge.owner_node_index = self.node_index
333 edge.parse()
334 self.edges.append(edge)
335
336 # only for speech sequence nodese
338 if not self.is_node_speech_sequence() or not self.raw_data:
339 return
340
341 raw_text_data = self.__get_raw_data_until_edges().strip()
342 inner_edges_parts = raw_text_data.split("---")
343 if not inner_edges_parts:
344 print_yellow("Node Index = {} which is a speech sequence node does not have inner edges".format(self.node_index))
345 return
346
347 for raw_inner_edge in inner_edges_parts:
348 inner_edge = TwineInnerEdgeData()
349 inner_edge.raw_data = raw_inner_edge.strip()
350 inner_edge.owner_node_index = self.node_index
351 inner_edge.parse()
352 self.inner_edges.append(inner_edge)
353
354 def parse(self):
355 self.tags = [x.lower() for x in self.raw_tags.strip().split(" ")]
356
357 # Get node index and speaker
358 context_parse_name = "Node Name = {}".format(self.raw_name)
359 self.node_index, self.speaker = TwineHelper.parse_twine_node_name(self.raw_name, context_parse_name, context_parse_name, context_parse_name)
360
361 self._parse_text()
362 if not TwineNodeTag.has_valid_tags(self.tags):
363 print_yellow("Node Index = {} does not have any valid tags = {}".format(self.node_index, self.tags))
364
365 self._parse_edges()
366 self._parse_inner_edges()
367
368 def can_have_text(self):
369 return self.is_node_speech() or self.is_node_virtual_parent()
370
372 return self.is_node_start() or self.is_node_speech() or self.is_node_speech_sequence()
373
374 def is_node_start(self):
375 return TwineNodeTag.NODE_START in self.tags
376
377 def is_node_end(self):
378 return TwineNodeTag.NODE_END in self.tags
379
380 def is_node_speech(self):
381 return TwineNodeTag.NODE_SPEECH in self.tags
382
384 return TwineNodeTag.NODE_VIRTUAL_PARENT in self.tags
385
387 return TwineNodeTag.NODE_SPEECH_SEQUENCE in self.tags
388
390 return self.is_node_selector_first() or self.is_node_selector_random()
391
393 return TwineNodeTag.NODE_SELECTOR_FIRST in self.tags
394
396 return TwineNodeTag.NODE_SELECTOR_RANDOM in self.tags
397
398 def to_dict(self):
399 if self.node_index is None or self.node_index < ROOT_NODE_INDEX:
400 print_yellow("Node Index = {} is invalid ignoring".format(self.node_index))
401 return {}
402
403 edges = []
404 for edge in self.edges:
405 edges.append(edge.to_dict())
406
407 inner_edges = []
408 for inner_edge in self.inner_edges:
409 inner_edges.append(inner_edge.to_dict())
410
411 if self.is_node_speech_sequence():
412 return {
413 "NodeIndex": self.node_index,
414 "Speaker": self.speaker,
415 "Sequence": inner_edges,
416 "Edges": edges
417 }
418
419 if self.can_have_text() or self.is_node_start():
420 return {
421 "NodeIndex": self.node_index,
422 "Speaker": self.speaker,
423 "Text": self.text,
424 "Edges": edges
425 }
426
427 return {}
428
429 def __str__(self):
430 return "TwineNodeData(node_index = {}, speakr = {}, tags = {}, text = `{}`, edges = {})".format(self.node_index, self.speaker, self.tags, self.text, self.edges)
431
432 def __repr__(self):
433 return str(self)
434
435
437 def __init__(self):
438 self.raw_guid = None
439
440 self.dialogue_name = None
441 self.dialogue_guid = None
442 self.nodes = []
443
445 # Convert to default Unreal uuid
446 temp_uuid = uuid.UUID(self.raw_guid)
447 self.dialogue_guid = temp_uuid.hex.upper()
448
449 def parse(self):
451
452 def to_dict(self):
453 if self.dialogue_name is None or self.dialogue_guid is None or not self.nodes:
454 return {}
455
456 speech_nodes = []
457 speech_sequence_nodes = []
458 for node in self.nodes:
459 if node.is_node_speech_sequence():
460 speech_sequence_nodes.append(node.to_dict())
461 elif node.is_node_speech() or node.is_node_virtual_parent() or node.is_node_start():
462 speech_nodes.append(node.to_dict())
463 else:
464 # Ignore
465 pass
466
467 return {
468 "DialogueName": self.dialogue_name,
469 "DialogueGUID": self.dialogue_guid,
470 "SpeechNodes": speech_nodes,
471 "SpeechSequenceNodes": speech_sequence_nodes
472 }
473
474 def __str__(self):
475 return "TwineDocumentData(dialogue_name = {}, dialogue_guid = {}, nodes =\n{})".format(self.dialogue_name, self.dialogue_guid, "\n".join(str(n) for n in self.nodes))
476
477 def __repr__(self):
478 return str(self)
479
480
481class TwineHtmlParser(HTMLParser):
482 HTML_TAG_STORYDATA = "tw-storydata"
483 HTML_TAG_PASSAGE_DATA = "tw-passagedata"
484
485 HTML_ATTR_NAME = "name"
486 HTML_ATTR_TAGS = "tags"
487 HTML_ATTR_GUID = "ifid"
488
489 def __init__(self):
490 super().__init__()
492 self.current_tag = None
493 self.current_node = None
494
495 def handle_starttag(self, tag, attrs):
496 # print("Start tag:", tag)
497 self.current_tag = tag
498 if tag == self.HTML_TAG_STORYDATA:
499 # Data about dialogue
500 for attr in attrs:
501 attr_name, attr_value = attr
502 if attr_name == self.HTML_ATTR_NAME:
503 self.document.dialogue_name = attr_value.strip()
504 elif attr_name == self.HTML_ATTR_GUID:
505 self.document.raw_guid = attr_value.strip()
506
507 elif tag == self.HTML_TAG_PASSAGE_DATA:
508 # Data about each node
510 self.document.nodes.append(self.current_node)
511
512 for attr in attrs:
513 attr_name, attr_value = attr
514 if attr_name == self.HTML_ATTR_NAME:
515 self.current_node.raw_name = attr_value.strip()
516 elif attr_name == self.HTML_ATTR_TAGS:
517 self.current_node.raw_tags = attr_value.strip()
518
519 def handle_endtag(self, tag):
520 if tag == self.HTML_TAG_STORYDATA:
521 self.document.parse()
522 elif tag == self.HTML_TAG_PASSAGE_DATA:
523 self.current_node.parse()
524
525 self.current_tag = None
526 self.current_node = None
527 # print("End tag :", tag)
528
529 def handle_data(self, data):
530 if self.current_tag is None:
531 return
532 if self.current_node is None:
533 return
534
535 if self.current_tag == self.HTML_TAG_PASSAGE_DATA:
536 self.current_node.raw_data = data.strip()
537
538 def handle_comment(self, data):
539 print("Comment :", data)
540
541 def handle_entityref(self, name):
542 c = chr(name2codepoint[name])
543 print("Named ent:", c)
544
545 def handle_charref(self, name):
546 if name.startswith('x'):
547 c = chr(int(name[1:], 16))
548 else:
549 c = chr(int(name))
550 print("Num ent :", c)
551
552 def handle_decl(self, data):
553 print("Decl :", data)
554
555
def exit_program(status):
    """Exit the program with the exit status `status`."""
    # This is what sys.exit() does
    raise SystemExit(status)
558
559
def exit_program_error(message=None):
    """Exit the program with status 1, printing `message` in red first if there is one."""
    has_message = message is not None
    if has_message:
        print_red(message)

    exit_program(1)
564
565
567 exit_program(0)
568
569
571 if not os.path.isabs(path):
572 return os.path.abspath(path)
573
574 return path
575
576
578 if not os.path.isfile(path):
579 return False
580
581 filename = os.path.basename(str(path))
582 file, extension = os.path.splitext(filename)
583
584 if extension != ".html":
585 return False
586
587 # TODO Maybe parse the contents
588
589 return True
590
591
def json_save_dictionary(path, dictionary):
    """
    Writes `dictionary` as json, indented with 4 spaces, to the file at `path`.

    An IOError or a ValueError raised while saving is printed in red instead of being
    propagated. Always returns None.
    """
    try:
        with open(path, 'w') as out_file:
            try:
                json.dump(dictionary, out_file, indent=4)
            except ValueError as error:
                print_red("Can't save file = `{}`. Error = `{}`".format(path, error))
                return
    except IOError as error:
        print_red("Can't open file = `{}`. IOError = `{}`".format(path, error))
602
603
605 """
606 Returns a dictionary
607 """
608 try:
609 with open(path, 'r', encoding="utf8") as fh:
610 parser = TwineHtmlParser()
611 parser.feed(fh.read())
612 return parser.document
613 except IOError as e:
614 print_red("Can't open file = `{}`. IOError = `{}`".format(path, e))
615 return None
616
617
def export_twine_file_dlg_text_json(src_file_path, src_twine_dir_from, dst_json_dir):
    """
    Converts the twine file at `src_file_path` into a json file with the DLG_JSON_HUMAN_EXTENSION extension.

    The sub directory of the source file is recreated inside `dst_json_dir`: it is the part of
    the directory of `src_file_path` that comes after the first path component that is equal
    to `src_twine_dir_from`.

    Every failure (sub directory not found, destination is not a directory, file can't be
    parsed or has no usable data) prints a warning and returns without writing anything.
    """
    # Construct subdirectory we need to create our destination file
    src_dirname, src_filename = os.path.split(src_file_path)

    src_dirname_parts = src_dirname.split(os.sep)
    dst_dirname = None
    for index, part in enumerate(src_dirname_parts):
        if part == src_twine_dir_from:
            dst_dirname = os.sep.join(src_dirname_parts[index + 1:])
            break

    if dst_dirname is None:
        print_yellow("Can't find dst_dirname for src_file_path = `{}`".format(src_file_path))
        return

    # Ensure dirname exists in destination
    dst_dirname = os.path.join(dst_json_dir, dst_dirname)
    if not os.path.exists(dst_dirname):
        os.makedirs(dst_dirname, exist_ok=True)
        print_blue("Creating directory = `{}`".format(dst_dirname))
    # Check the directory we are going to write into (the one named in the warning),
    # not only the destination root
    if not os.path.isdir(dst_dirname):
        print_yellow("Path = `{}` is not a directory. Ignoring".format(dst_dirname))
        return

    # Parse file
    print_blue("Parsing file = `{}`".format(src_file_path))
    twine_document = twine_parse_file(src_file_path)
    if twine_document is None:
        print_yellow("Can't parse twine file = `{}`".format(src_file_path))
        return

    json_human_content = twine_document.to_dict()
    if not json_human_content:
        print_yellow("Twine file = `{}` is corrupt or invalid. Can't parse any data".format(src_file_path))
        return

    # Write file, same file name as the source but with our extension
    src_file, _ = os.path.splitext(src_filename)
    dst_file_path = os.path.join(dst_dirname, src_file) + DLG_JSON_HUMAN_EXTENSION
    print_blue("Writing file = `{}`".format(dst_file_path))
    json_save_dictionary(dst_file_path, json_human_content)
    print("")
663
664
665
def main(src_twine_dir, dst_json_dir):
    """
    Exports every twine file found (recursively) in `src_twine_dir` into `dst_json_dir`.

    Exits with an error if the source is not an existing directory or if the destination is
    not a directory. The destination directory is created if it does not exist.
    """
    if not os.path.exists(src_twine_dir):
        exit_program_error("src_twine_dir = `{}` does not exist".format(src_twine_dir))
    if not os.path.isdir(src_twine_dir):
        exit_program_error("src_twine_dir = `{}` is not a directory".format(src_twine_dir))

    if not os.path.exists(dst_json_dir):
        os.makedirs(dst_json_dir, exist_ok=True)
        print_blue("Creating dst_json_dir = `{}`".format(dst_json_dir))
    if not os.path.isdir(dst_json_dir):
        exit_program_error("dst_json_dir = `{}` is not a directory".format(dst_json_dir))

    # From here on only absolute paths are used
    src_twine_dir = convert_path_to_absolute_if_not_already(src_twine_dir)
    dst_json_dir = convert_path_to_absolute_if_not_already(dst_json_dir)
    print_blue("Finding save files in src_twine_dir = {}\n".format(src_twine_dir))

    # Name of the source directory, used to compute the sub directory of every file
    src_twine_dir_from = os.path.basename(os.path.normpath(src_twine_dir))

    # Walk over all files in directory
    for current_dir, _subdirs, filenames in os.walk(src_twine_dir):
        for filename in filenames:
            full_filename = os.path.join(current_dir, filename)
            if not is_path_twine_file(full_filename):
                print_yellow("Path = `{}` is not a file or a twine file".format(full_filename))
                continue

            export_twine_file_dlg_text_json(full_filename, src_twine_dir_from, dst_json_dir)
692
693
if __name__ == "__main__":
    # Both arguments are optional positionals with a default directory
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'src_twine_dir',
        nargs='?',
        type=str,
        default="DialoguesTwine/",
        help='Source directory from where we get all the .html twine files',
    )
    parser.add_argument(
        'dst_json_dir',
        nargs='?',
        type=str,
        default="DialoguesJsonHumanText/",
        help='Destination directory where we store all the .dlg_human.json files',
    )

    args = parser.parse_args()
    main(args.src_twine_dir, args.dst_json_dir)
parse_twine_node_name(cls, raw_name, context_multiple_matches, context_invalid_index, context_invalid_speaker)
main(src_twine_dir, dst_json_dir)
export_twine_file_dlg_text_json(src_file_path, src_twine_dir_from, dst_json_dir)
print_config_value(config_name, config_value)
_print_internal(color, string, **kwargs)