diff --git a/examples/temporal_extraction/README.md b/examples/temporal_extraction/README.md new file mode 100644 index 00000000..efa116fc --- /dev/null +++ b/examples/temporal_extraction/README.md @@ -0,0 +1,22 @@ +This code provides a wrapper which sends texts through our temporal REST API. + +In one terminal, run the following to set up the REST entrypoint: + + cd cnlp_transformers/src/cnlpt/api + python temporal_rest.py + +In another, run the script in this folder, `extract_temporal.py`. The arguments are: +* Required: + * `-d`, `--data_dir`: directory in which to find the texts. There should be one text per file. + * `-o`, `--out_dir`: directory in which to save outputs + * `-s`, `--sentence_dir`: directory in which to save the segmented sentences +* Optional: + * `-u`, `--rest_url`: default `"http://0.0.0.0:8000/temporal/process"` + * `--backup_rest_url`: server to retry on when the primary server returns HTTP 500; default `"http://0.0.0.0:8000/temporal/process"` + * `--input_format`: `"json"`, `"pkl"` or `"txt"`; default `"json"` + * `--text_name`: default `"text"` + * `--output_format`: `"json"` or `"pkl"`; default `"json"` + +Note: if this is being run on a cluster like E2, these programs must be run on the same node. + +The post-processing code here was written for our temporal REST API, but can be modified to suit the outputs of our other REST APIs, e.g. negation detection. diff --git a/examples/temporal_extraction/conf/rush_rules.tsv b/examples/temporal_extraction/conf/rush_rules.tsv new file mode 100644 index 00000000..6070aa03 --- /dev/null +++ b/examples/temporal_extraction/conf/rush_rules.tsv @@ -0,0 +1,977 @@ +#/******************************************************************************* +# * Copyright 2016 Department of Biomedical Informatics, University of Utah +# *

+# * Licensed under the Apache License, Version 2.0 (the "License"); +# * you may not use this file except in compliance with the License. +# * You may obtain a copy of the License at +# *

+# * http://www.apache.org/licenses/LICENSE-2.0 +# *

+# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# *******************************************************************************/ + +#this list is optimized for shorter rule length rules for semeval were added +@MaxRepeatLength 100 +@Version 1.0.3.4 + +#stbegin is the marker for sentence begin, the span of sentence will start at the begin of the captured group +#stbegin has two scores 0, 1: 0 for true sentence begin clues, 1 for false sentence begin clues which will overwrite 0-scored rules when they are overlapping. +#stend is the marker for sentence end, the span of sentence will end at the end of the captured group +#stend also has two scores 2, 3: 2 for true sentence end clues, 3 for false sentence end clues which will overwrite 2-scored rules when they are overlapping + +# \b the begin of an input +# \e the end of an input +# \d A digit +# \C A capital letter +# \c A lowercase letter +# \s A character of whitespace or tab or Unicode code point 160 +# \w A character of \s above or Unicode code greater than '~' +# \a A Non-whitespace character +# \u A unusual character: Unicode code greater than '~' (excluding Unicode 160) +# \n A return ('\n' or '\r') +# ( Beginning of capturing a group +# ) End of capturing a group +# \p A punctuation +# +# \+ An addition symbol (to distinguish the "+" after a wildcard) +# \( A left parentheses symbol +# \) A right parentheses symbol +# +# A wildcard followed by a "+": 1 or more characters that match the wildcard +\b(\C 0 stbegin +\b(\d 0 stbegin +\b\s+(\C 0 stbegin +\b\s+(\d 0 stbegin +\c.\s+(\C) 0 stbegin + mL.\s+(\C) 0 stbegin +*) 1 stbegin +\c\c.\s+(\C) 0 stbegin +\c\).\s+(\C) 0 stbegin +\d\).\s+(\C) 0 stbegin +\C\C\C.\s+(\C)\c 0 stbegin 
+\d.\s+(\C) 0 stbegin +\n\n\s+(\C) 0 stbegin + Med\n\n\n+(\c+) 0 stbegin + Med\s+\n\n\n+\s+(\c+) 0 stbegin + Med\s+\n+\s+\n+\s+(\c+) 0 stbegin + Normal\s+\n+\s+\n+\s+(\c+) 0 stbegin +\c\c.\n+(\c\c 0 stbegin +\c\c.\n+(\d+ 0 stbegin +\d%.\n+(\d+ 0 stbegin +\c\c.\n\n\w+(\c+) 0 stbegin +\c\c.\s+\n\n\w+(\c+) 0 stbegin +\c\c.\s+\n\n\w+(\c+) 0 stbegin +\c\c.\n\w+(\c+) 0 stbegin +\c\c.\w+(\c+) 0 stbegin +\c.\n+(\d+)\s 0 stbegin +\c.\s+\n+(\d+)\s 0 stbegin +\c.\n+\s+(\d+)\s 0 stbegin +\c.\s+\n+(\d).\s+\d 0 stbegin +\c.\s+\n+\s+(\d+)\s+ 0 stbegin +\c.\n+(\d+).\d+x 0 stbegin +\c.\s+\n+(\d+).\d+x 0 stbegin +\c.\n+\s+(\d+).\d+x 0 stbegin +\c.\s+\n+\s+(\d+).\d+x 0 stbegin +\c.\n+(\d+).\d+* 0 stbegin +\c.\s+\n+(\d+).\d+* 0 stbegin +\c.\n+\s+(\d+).\d+* 0 stbegin +\c.\s+\n+\s+(\d+).\d+* 0 stbegin +\c.\n+(\d+)x 0 stbegin +\c.\s+\n+(\d+)x 0 stbegin +\c.\n+\s+(\d+)x 0 stbegin +\c.\s+\n+\s+(\d+)x 0 stbegin +\c.\n+(\d+)\s+\c 0 stbegin +\c.\s+\n+(\d+)\s+\c 0 stbegin +\c.\n+\s+(\d+)\s+\c 0 stbegin +\c.\s+\n+\s+(\d+)\s+\c 0 stbegin +\c.\n+(\d+)\c 0 stbegin +\c.\s+\n+(\d+)\c 0 stbegin +\c.\n+\s+(\d+)\c 0 stbegin +\c.\s+\n+\s+(\d+)\c 0 stbegin +\n\n\s\w+\d+.(\C) 0 stbegin +\n\n\s\w+\d+.\s+(\C) 0 stbegin +\n\n\s\w+(\d)\s 0 stbegin +\n\n\s\w+(\d+)\s 0 stbegin +\n\n\s\w+(")\C 0 stbegin +\n\n\d+.(\C) 0 stbegin +\n\d/\s+(\C) 0 stbegin +\n\n(\d+.\s+\C 0 stbegin +\n(\d+.\s+\c 0 stbegin +\n\n(\d)\s 0 stbegin +\n\n(\d+)\s 0 stbegin +\n\n(")\C 0 stbegin +\n\n(")\s+\C 0 stbegin +\n\n\s+(")\s+\C 0 stbegin +\n\n(-\C 0 stbegin +\n(-\C 0 stbegin +\c.\s+\n(-\C 0 stbegin +\c:\s+\n(-\C 0 stbegin +\c.\n(-\C 0 stbegin +\c:\n(-\C 0 stbegin +\n\n\s+(-\C 0 stbegin +\n\n\s+(-\s+\C 0 stbegin +\n\n(-\s+\C 0 stbegin + + +\n\n(-\s+\c)\c 0 stbegin +\n\n\s+-\s+(\c)\c 0 stbegin +\c.\n(-\c)\c 0 stbegin +\c:\n(-\c)\c 0 stbegin +\c.\s+\n(-\c) 0 stbegin +\c:\s+\n(-\c) 0 stbegin +\n(-\c)\c 0 stbegin +\n (• \c 0 stbegin + +\c.\n+(-\d) 0 stbegin +\c.\s+\n+(-\d) 0 stbegin +\c.\s+\n+\s+(-\d) 0 stbegin + +\n\n*(\C) 0 stbegin 
+\n\n\s+(*)\C 0 stbegin +\n\n\s+(*)\s+\C 0 stbegin +\n\n(*)\s+\C 0 stbegin +\n\n\s+(')\C 0 stbegin +\n\n(')\C 0 stbegin +\n\n(')\s+\C 0 stbegin +\n\n\s+(')\s+\C 0 stbegin +\n\n\s+(%)\C 0 stbegin +\n\n(%)\C 0 stbegin +\n\n(%)\s+\C 0 stbegin +\n\n\s+(%)\s+\C 0 stbegin +\n\n*\p+(\C 0 stbegin +\n\n*\p+\s+(\C 0 stbegin +\n**\p+\s+(\C 0 stbegin +\n**\s+(\C 0 stbegin +\n**\p+(\C 0 stbegin +\n**(\C 0 stbegin +\n**(\d 0 stbegin +\n\n\s+*\p+(\C 0 stbegin +\n\n\s+*\p+\s+\C 0 stbegin +\c.\s+**\p+(\C 0 stbegin + +\n\n\s+(\u)\s+\d\s 0 stbegin +\n\n\s+(\u)\s+\d+\s 0 stbegin +\n\n\s+(\u)\s+\d+/ 0 stbegin +\n\n\s+(\u)\s+\d/ 0 stbegin +\n\n\s+(\u)\s+\c 0 stbegin +\n\n\s+(\u)\s+\C 0 stbegin +\n\n(\u)\s+\C 0 stbegin +?\s+(\C)\c 0 stbegin +?\s+(\d 0 stbegin +!\s+(\C)\c 0 stbegin +!\s+(\d 0 stbegin + + + +#start with time +\n\n(\d):\d\s 0 stbegin +\n\n(\d):\d\d\s 0 stbegin +\n\n(\d)\d:\d\d\s 0 stbegin +\n\n(\d)\d:\d\s 0 stbegin +\n\n(\d):\d- 0 stbegin +\n\n(\d):\d\d- 0 stbegin +\n\n(\d)\d:\d\d- 0 stbegin +\n\n(\d)\d:\d- 0 stbegin +\n\n\w+(\d):\d\s 0 stbegin +\n\n\w+(\d):\d\d\s 0 stbegin +\n\n\w+(\d)\d:\d\d\s 0 stbegin +\n\n\w+(\d)\d:\d\s 0 stbegin +\n\n\w+(\d):\d- 0 stbegin +\n\n\w+(\d):\d\d- 0 stbegin +\n\n\w+(\d)\d:\d\d- 0 stbegin +\n\n\w+(\d)\d:\d- 0 stbegin +#start with dates +\n\n(\d+)\s+ 0 stbegin +\n\n(\d)\d/\d/\d\d\d\d 0 stbegin +\n\n(\d)/\d/\d\d\d\d 0 stbegin +\n\n(\d)\d/\d\d/\d\d\d\d 0 stbegin +\n\n(\d)/\d\d/\d\d\d\d 0 stbegin +\n\n(\d)\d/\d/\d\d 0 stbegin +\n\n(\d)/\d/\d\d 0 stbegin +\n\n(\d)\d/\d\d/\d\d 0 stbegin +\n\n(\d)/\d\d/\d\d 0 stbegin +\n\n(\d)\d/\d\s 0 stbegin +\n\n(\d)/\d\s 0 stbegin +\n\n(\d)\d/\d\d\s 0 stbegin +\n\n(\d)/\d\d/\d\s 0 stbegin +\n+\s\s\s\s(\C) 0 stbegin +\n+\s\s\s(\C) 0 stbegin +\n+\s\s(\C) 0 stbegin +\n+(\C) 0 stbegin +.\s+(N)ow 0 stbegin +.\s+(D)ischarge 0 stbegin + +\n(\(-\)\s+\C 0 stbegin +#\n(\(+\)\s+\C 0 stbegin +\n       (\d 0 stbegin +\C:\n+(\d 0 stbegin +\n(\d+).\s+\C 0 stbegin +\n(\d+.\C 0 stbegin +\n\s+(\d.\s+\C 0 stbegin +\n\s+(\d\d.\s+\C 
0 stbegin +\n\d.\)\s+(\C 0 stbegin +\n\d\d.\)\s+(\C 0 stbegin +\c:\n+(\a 0 stbegin +\s+\s+(\d\)\s+\C 0 stbegin +\s+\s+(\d\d\)\s+\C 0 stbegin +\n) \d\d\) 2 stend + +\c:\n+(\d. 0 stbegin +\d:\n+(\d 0 stbegin + +\C:\s+\n+(\d 0 stbegin +\C:\s+\n+(1.  0 stbegin +\c:\s+\n+(\d 0 stbegin +\d:\s+\n+(\d 0 stbegin +\).\s+(\C 0 stbegin +\n(- \c 0 stbegin +\n(- \C 0 stbegin +\n(# \c 0 stbegin +\n(# \C 0 stbegin +\n(#\C 0 stbegin +\n(#\c 0 stbegin +\n(* \c 0 stbegin +\n(* \C 0 stbegin +\n(? \C 0 stbegin +\n(? \c 0 stbegin +\n(. \C 0 stbegin +\n(+ \C 0 stbegin +\n(/ \C 0 stbegin +\n+\d\d-\d\d\s+(\C 0 stbegin +\n+\d+-\d\d-\d\d\s+(\C 0 stbegin +\n+\d+-\d\d-\d\d\s+:\s+(\C 0 stbegin +\c.\s+\n(\d.\C 0 stbegin +\n(\d\)\s+\C 0 stbegin +\n(\d\d\)\s+\C 0 stbegin +\n(\d\)\s+\c 0 stbegin +\n(\d\)\s+?\c 0 stbegin +\n(\d\d\)\s+\c 0 stbegin +\n(\d\)\C 0 stbegin +\s\s(\d\)\C 0 stbegin +\s\s(\d\)?\s+\C 0 stbegin + +\c)\w+\d\)\s+\d+\s+(\c 0 stbegin + +\c\w+(\d\)\C 0 stbegin +\d\)\C+\w+(\d\)\c 0 stbegin + +\c)\w+\d\) 2 stend +\c)\w+\d\d\) 2 stend +\(\a+\w+\a+\) 3 stend +\c\c)\w+\d\d\), 3 stend +\c\c)\w+\d\), 3 stend +\c\c)\w+\d\). 3 stend +\c\c)\w+\d\d\). 3 stend +from \d+ to \d+ 3 stend + +\C\C)\w+\d\d\), 3 stend +\C\C)\w+\d\), 3 stend +\C\C)\w+\d\). 3 stend +\C\C)\w+\d\d\). 
3 stend + +\C(\C)\w+\d\) 2 stend +\C(\C)\w+\d\d\) 2 stend +\(\C+\s+\d\d\) 3 stend +\(\c+\s+\d\d\) 3 stend +\d(%)\w+\d\) 2 stend +\d(%)\w+\d\d\) 2 stend +\d)\w+\d\) 2 stend +\d)\w+\d\d\) 2 stend +\d\d-\d\d\s+(.)\s+\C 2 stend +\d\d-\d\d\s+.\s+(\C 0 stbegin +\d\d\d(\d)\s+.\s+\C 2 stend +\d\d\d\d\s+.\s+(\C 0 stbegin + +\n(\d.\)\C 0 stbegin +\n(\d.\)\s+\C 0 stbegin +\n\s+(\d.\s+\C 0 stbegin +\n\s+(\d.\)\C 0 stbegin +\n\s+(\d.\)\s+\C 0 stbegin +\n\d.\s+(\d)\d-\d\d\s 0 stbegin +\n\d.\s+(\d)\d-\d\d\d\d\s 0 stbegin +\n\d.\s+(\d)\d-\d\d-\d\d\d\d\s 0 stbegin +\n\(a\)\s+(\C 0 stbegin +\n\(b\)\s+(\C 0 stbegin +\n\(c\)\s+(\C 0 stbegin +\n\(d\)\s+(\C 0 stbegin +\n\(e\)\s+(\C 0 stbegin +\n\(f\)\s+(\C 0 stbegin +\n\(g\)\s+(\C 0 stbegin +\n(\(\d\)\s+\C 0 stbegin +\n("\C 0 stbegin + + + +(\a)\s+\n+- 2 stend +\c(\c)\n+ \C 2 stend +\a(.) + 2 stend +\sms. 3 stend +\sMs. 3 stend +\sDr. 3 stend +\sdr. 3 stend +\sMrs. 3 stend +\sMr. 3 stend +\smr. 3 stend +\smrs. 3 stend +\sphd. 3 stend +\sb.i.d.\s+\c 3 stend +\sB.i.d.\s+\c 3 stend +\sB.I.D.\s+\c 3 stend +\sbid.\s+\c 3 stend +\sBID.\s+\c 3 stend +\st.i.d.\s+\c 3 stend +\sT.i.d.\s+\c 3 stend +\sT.I.D.\s+\c 3 stend +\stid.\s+\c 3 stend +\sTID.\s+\c 3 stend +\sq.i.d.\s+\c 3 stend +\sQ.i.d.\s+\c 3 stend +\sQ.I.D.\s+\c 3 stend +\sqid.\s+\c 3 stend +\sQID.\s+\c 3 stend +\sq.d.\s+\c 3 stend +\sQ.d.\s+\c 3 stend +\sQ.D.\s+\c 3 stend +\sqd.\s+\c 3 stend +\sQD.\s+\c 3 stend + + + + + mL(.)\s+The 2 stend +\c(.)\s+I 2 stend +\d(.)\s+\C 2 stend +\d(.)\s\C 2 stend +.\s+\d.\s+\C 2 stend +\)(.)\s+\C 2 stend +\p\p\p\s+\n\C 2 stend +\)(.)\s+\n\C 2 stend +\c(\c)\n+\C 2 stend +\c(\c)\s+\n+\C 2 stend +\a\s+\n\n 2 stend +\a\n\n 2 stend +\c)******** 2 stend +\c)**\n 2 stend +\c)**\s+\n 2 stend +\c)**\p+\s+\n 2 stend +\c)**\p+\n 2 stend +\c)\s+**\p+\s+\n 2 stend +\c\s+**\p+\n 2 stend +\c\s+\n\w+** 2 stend +\c.\s+\n\w+** 2 stend +\d(.)\s+\n\w+** 2 stend +(\d)**\p+\s+\n 2 stend +\d**\p+\n 2 stend +.\s+**\p+\s+\n 2 stend +.\s+**\p+\n 2 stend +.**\p+\s+\n 2 stend 
+.**\s+\p+\n 2 stend +.**\p+\s+\p+\n 2 stend +.**\p+\n 2 stend +.**\s+\n\n 2 stend +.**\n\n 2 stend +.\s+**\s+\n\n 2 stend +.\s+**\n\n 2 stend +:**\p+\n 2 stend +:**\s+\n\n 2 stend +:**\s+\n\w+ 2 stend +:**\n\n 2 stend +:\s+**\s+\n\n 2 stend +:\s+**\n\n 2 stend +:)\n\u\s+\C 2 stend +\d**\s+\n\n 2 stend +\d**\n\n 2 stend +\a\s+\n+** 2 stend +\a\s+\n\w+**\p+\C 2 stend +\c(.\s+**\p+\C 2 stend +\d)\s+\n+\d.\s+\C 2 stend +\d)\s+\n+\d\d.\s+\C 2 stend +\c)\s+\n+\d.\s+\C 2 stend +\c)\s+\n+\d\d.\s+\C 2 stend +\C)\s+\n+\d.\s+\C 2 stend +\C)\s+\n+\d\d.\s+\C 2 stend +\d)\s+\n+\d.\s+\c 2 stend +\c)\s+\n+\d.\s+\c 2 stend +\c)\s+\n+\d\d.\s+\c 2 stend +\C)\s+\n+\d.\s+\c 2 stend +\C)\s+\n+\d\d.\s+\c 2 stend + +\c(\))\s+\n\n 2 stend +\c\c(.)\s+\C 2 stend +\c(.)\s+\n 2 stend +\d(.)\n 2 stend +\c(:)\n 2 stend +\C(:)\n 2 stend +\d(:)\n 2 stend +\c(:)\s+\n 2 stend +\C(:)\s+\n 2 stend +\d(:)\s+\n 2 stend +\C\C\C(.)\s+\C\c 2 stend +\C(.)\n 2 stend +\)(.)\n 2 stend +](.)\n 2 stend +\c(.)\n 2 stend + + +\n\d+(.)\s+\C 3 stend +\d+.\C+(:)\s+\n 3 stend +Mrs(.) 3 stend +Miss(.) 3 stend +Mr(.) 3 stend +Ms(.) 3 stend +\c\n+\c 3 stend +\c\n+\s+\c 3 stend +\c\s+\n+\c 3 stend +\c\s+\n+\s+\c 3 stend + +,\w+\c\c 3 stend +,\n\w+\c\c 3 stend +,\w+\c\c 3 stend +,\w+\d+ 3 stend +,\n\w+\d+ 3 stend +,\w+\d+ 3 stend +;\w+\c\c 3 stend + +\)\w+\c\c 3 stend +\)\n\w+\c\c 3 stend +\)\w+\d 3 stend +\)\n\w+\d+ 3 stend +\)\w+\d 3 stend +\c)\s+\d+\)\s+\d+\s+. 
2 stend +\d+\s+.\s+(\C 0 stbegin + +\s+\C(\C)\w+\c\c 3 stend +\s+\C\C(\C)\w+\c\c 3 stend +\s+\C\C\C(\C)\w+\c\c 3 stend + +A\w+\c\c 3 stend +A\n\w+\c\c 3 stend +A\w+\c\c 3 stend +A\w+\d+ 3 stend +A\n\w+\d+ 3 stend +A\w+\d+ 3 stend + +\d+)\w+week 3 stend +\d+)\w+month 3 stend +\d+)\w+\day 3 stend +\d+)\w+year 3 stend +\d+)\w+cm 3 stend +\d+)\w+m 3 stend +\d+)\w+mg 3 stend +\d+)\w+g 3 stend +\d+)\w+kg 3 stend +\d+)\w+lb 3 stend +\d+)\w+feet 3 stend +\d+)\w+inch 3 stend +\d+)\w+ml 3 stend +\d+)\w+ou 3 stend +\d+)\w+ounce 3 stend +\d+)\w+total dose 3 stend +\d+)\w+dose 3 stend +\d+)\w+tablet 3 stend + +#start with number + units +\c\n+\d+\s+\c\c 3 stend +\c\n+\s+\d+\s+\c\c 3 stend +\c\s+\n+\d+\s+\c\c 3 stend +\c\s+\n+\d+\s+\s+\c\c 3 stend +#start with float + units +\c\n+\d+.\d+\s+\c\c 3 stend +\c\n+\s+\d+.\d+\s+\c\c 3 stend +\c\s+\n+\d+.\d+\s+\c\c 3 stend +\c\s+\n+\d+.\d+\s+\s+\c\c 3 stend + +are:\s+\n+\c 3 stend +\sis:\s+\n+\c 3 stend +was:\s+\n+\c 3 stend +were:\s+\n+\c 3 stend +are:\n+\c 3 stend +\sis:\n+\c 3 stend +was:\n+\c 3 stend +were:\n+\c 3 stend +are:\n+\s+\c 3 stend +\sis:\n+\s+\c 3 stend +was:\n+\s+\c 3 stend +were:\n+\s+\c 3 stend +are:\s+\n+\s+\c 3 stend +\sis:\s+\n+\s+\c 3 stend +was:\s+\n+\s+\c 3 stend +were:\s+\n+\s+\c 3 stend +#:\n+\c)+ 3 stend +#:\s+\n+\c\c 3 stend +#:\n+\s+\c\c 3 stend +#:\s+\n+\s+\c\c 3 stend +\spulm. 3 stend + + mL\n+\c)+ 3 stend + mL\s+\n+\c\c 3 stend + mL\n+\s+\c\c 3 stend + mL\s+\n+\s+\c\c 3 stend + + +\a)\s+\n\n\n+ • 2 stend +\s+\n\n+\s+\C 2 stend +\d+.\s+\C+(:\s+\n\n+\s+\C\c+\s+\d+. 3 stend +\d+.\s+\C\c+(:\s+\n\n+\s+\C\c+\s+\d+. 
3 stend +\a\w+_______________ 2 stend +\a(\p)\w+_______________ 2 stend +(\c)\n- \c 2 stend +(\c)\n- \C 2 stend +\c.(")\s+\C 2 stend +\c."\s+(\C 0 stbegin + +Heart\nFailure 3 stend + and\s+\n\n 3 stend + that\s+\n\n 3 stend + for\s+\n+ 3 stend + had\s+\n+ 3 stend + have\s+\n+ 3 stend + has\s+\n+ 3 stend + "I\s+\n\n 3 stend + I\s+\n\n 3 stend +\(\C+\s+\n\n 3 stend +\(\c+\s+\n\n 3 stend +\n(rhabdomyolysis:\n 0 stbegin +.\s+\n+(\c+\s+\c+\s+\c+\s+\c+:\n 0 stbegin +.\s+\n+(\c+\s+\c+\s+\c+:\n 0 stbegin +.\s+\n+(\c+\s+\c+\s+\c+:\n 0 stbegin +#\w+(H)istory of Present Illness: 0 stbegin +\c)\w+History of Present Illness: 2 stend +\C)\w+History of Present Illness: 2 stend +\p)\w+History of Present Illness: 2 stend +\c)\w+History of present illness: 2 stend +\C)\w+History of present illness: 2 stend +\p)\w+History of present illness: 2 stend +\c)\w+HISTORY OF PRESENT ILLNESS: 2 stend +\C)\w+HISTORY OF PRESENT ILLNESS: 2 stend +\p)\w+HISTORY OF PRESENT ILLNESS: 2 stend +\c)\w+Past Medical History: 2 stend +\C)\w+Past Medical History: 2 stend +\p)\w+Past Medical History: 2 stend +\c)\w+History of Past Illness: 2 stend +\C)\w+History of Past Illness: 2 stend +\p)\w+History of Past Illness: 2 stend +\c)\w+Chief Complaint: 2 stend +\C)\w+Chief Complaint: 2 stend +\p)\w+Chief Complaint: 2 stend +\c)\w+Chief Complaint: 2 stend +\C)\w+Chief Complaint: 2 stend +\p)\w+Chief Complaint: 2 stend +.)\s+The 2 stend +.\s+(The 0 stbegin +.\s+(\d.\s+\C 0 stbegin +.\s+(\d.\C 0 stbegin +\c(.\s+\d.\C 2 stend + + +\c)\w+REASON FOR 2 stend +\C)\w+REASON FOR 2 stend +\d)\w+REASON FOR 2 stend +\p)\w+REASON FOR 2 stend +\c)\w+\w+REASON FOR 2 stend +\C)\w+\w+REASON FOR 2 stend +\d)\w+\w+REASON FOR 2 stend +\p)\w+\w+REASON FOR 2 stend +\c)\w+Reason For 2 stend +\C)\w+Reason For 2 stend +\d)\w+Reason For 2 stend +\p)\w+Reason For 2 stend +\c)\w+\w+Reason For 2 stend +\C)\w+\w+Reason For 2 stend +\d)\w+\w+Reason For 2 stend +\p)\w+\w+Reason For 2 stend +R)EASON FOR 0 stbegin +#REASON FOR THIS 
EXAMINATION(: 2 stend +#REASON FOR\w+(\d 0 stbegin +#REASON FOR\w+(\C 0 stbegin +#REASON FOR\w+(\c 0 stbegin +#REASON FOR\w+(\p 0 stbegin +#Reason For This Examination(: 2 stend +#Reason For\w+(\d 0 stbegin +#Reason For\w+(\C 0 stbegin +#Reason For\w+(\c 0 stbegin +#Reason For\w+(\p 0 stbegin + + +\c)\w+INDICATION: 2 stend +\C)\w+INDICATION: 2 stend +\d)\w+INDICATION: 2 stend +\p)\w+INDICATION: 2 stend +\c)\w+Indication: 2 stend +\C)\w+Indication: 2 stend +\d)\w+Indication: 2 stend +\p)\w+Indication: 2 stend +#INDICATION(: 2 stend +#INDICATION:\w+(\d 0 stbegin +#INDICATION:\w+(\C 0 stbegin +#INDICATION:\w+(\c 0 stbegin +#INDICATION:\w+(\p 0 stbegin +#Indication(: 2 stend +#Indication:\w+(\d 0 stbegin +#Indication:\w+(\C 0 stbegin +#Indication:\w+(\c 0 stbegin +#Indication:\w+(\p 0 stbegin + + +\c)\w+REASON: 2 stend +\C)\w+REASON: 2 stend +\d)\w+REASON: 2 stend +\p)\w+REASON: 2 stend +\c)\w+Reason: 2 stend +\C)\w+Reason: 2 stend +\d)\w+Reason: 2 stend +\p)\w+Reason: 2 stend +#REASON(: 2 stend +#REASON:\w+(\d 0 stbegin +#REASON:\w+(\C 0 stbegin +#REASON:\w+(\c 0 stbegin +#REASON:\w+(\p 0 stbegin +#Reason(: 2 stend +#Reason:\w+(\d 0 stbegin +#Reason:\w+(\C 0 stbegin +#Reason:\w+(\c 0 stbegin +#Reason:\w+(\p 0 stbegin + +\a)\w+Admitting Diagnosis: 2 stend +\a)\w+ADMITTING DIAGNOSIS: 2 stend +\a\w+(A)dmitting Diagnosis: 0 stbegin +\a\w+(A)DMITTING DIAGNOSIS: 0 stbegin +#Admitting Diagnosis(: 2 stend +#Admitting Diagnosis:\w+(\d 0 stbegin +#Admitting Diagnosis:\w+(\C 0 stbegin +#Admitting Diagnosis:\w+(\c 0 stbegin +#Admitting Diagnosis:\w+(\p 0 stbegin +#ADMITTING DIAGNOSIS(: 2 stend +#ADMITTING DIAGNOSIS:\w+(\d 0 stbegin +#ADMITTING DIAGNOSIS:\w+(\C 0 stbegin +#ADMITTING DIAGNOSIS:\w+(\c 0 stbegin +#ADMITTING DIAGNOSIS:\w+(\p 0 stbegin + + +\c)\w+Discharge Diagnosis: 2 stend +\d)\w+Discharge Diagnosis: 2 stend +\p)\w+Discharge Diagnosis: 2 stend +\C)\w+Discharge Diagnosis: 2 stend +\c)\w+DISCHARGE DIAGNOSIS: 2 stend +\C)\w+DISCHARGE DIAGNOSIS: 2 stend +\d)\w+DISCHARGE 
DIAGNOSIS: 2 stend +\p)\w+DISCHARGE DIAGNOSIS: 2 stend +#Discharge Diagnosis(: 2 stend +#Discharge Diagnosis:\w+(\d 0 stbegin +#Discharge Diagnosis:\w+(\C 0 stbegin +#Discharge Diagnosis:\w+(\c 0 stbegin +#Discharge Diagnosis:\w+(\p 0 stbegin +#DISCHARGE DIAGNOSIS(: 2 stend +#DISCHARGE DIAGNOSIS:\w+(\d 0 stbegin +#DISCHARGE DIAGNOSIS:\w+(\C 0 stbegin +#DISCHARGE DIAGNOSIS:\w+(\c 0 stbegin +#DISCHARGE DIAGNOSIS:\w+(\p 0 stbegin + +\c)\w+FINDINGS: 2 stend +\C)\w+FINDINGS: 2 stend +\d)\w+FINDINGS: 2 stend +\p)\w+FINDINGS: 2 stend +F)INDINGS: 0 stbegin +#FINDINGS(: 2 stend +#FINDINGS:\w+(\d 0 stbegin +#FINDINGS:\w+(\C 0 stbegin +#FINDINGS:\w+(\c 0 stbegin +#FINDINGS:\w+(\p 0 stbegin +\c)\w+Findings: 2 stend +\C)\w+Findings: 2 stend +\d)\w+Findings: 2 stend +\p)\w+Findings: 2 stend +#Findings(: 2 stend +#Findings:\w+(\d 0 stbegin +#Findings:\w+(\C 0 stbegin +#Findings:\w+(\c 0 stbegin +#Findings:\w+(\p 0 stbegin + + +#Brief Hospital Course(: 2 stend +#Brief Hospital Course:\w+(\d 0 stbegin +#Brief Hospital Course:\w+(\C 0 stbegin +#Brief Hospital Course:\w+(\c 0 stbegin +#Brief Hospital Course:\w+(\p 0 stbegin + + + + +\c(?)\w+ 2 stend +\C(?)\w+ 2 stend +\d(?)\w+ 2 stend +:\w+(?\w+ 3 stend + + +D(.)\s+\n+\d+.\s+\C 2 stend +N(.)\s+\n+\d+.\s+\C 2 stend +NPO(.)\s+\n+\C 2 stend +\)(.)\w+\d+.\s+\C 2 stend +\c+\s+(.)\s+\C\c+ 2 stend +\c+\s+.\s+(\C\c+ 0 stbegin +P(M\n\C 2 stend +\a)\s\s\s\s+Reason: 2 stend +\s\s\s\s+(Reason: 0 stbegin +\a)\s\s\s\s+Admitting Diagnosis: 2 stend +\s\s\s\s+(Admitting Diagnosis: 0 stbegin +\a)\s\s\s\s+Sex: 2 stend +\s\s\s\s+(Sex: 0 stbegin +\a)\s\s\s\s+Discharge Date: 2 stend +\s\s\s\s+(Discharge Date: 0 stbegin +dail(y\n- 2 stend +qh(s\n- 2 stend +dail(y\n- 2 stend +\sq(d\n- 2 stend +QH(S\n- 2 stend +\a)\s+Refills: 2 stend +\a)*\s+Refills: 2 stend +\a\s+(Refills: 0 stbegin +\a)\n\C: 2 stend +\a)\s+\n\C: 2 stend +\n(\C: 0 stbegin +\n(JOB#: 0 stbegin +\a)\nJOB#: 2 stend +\a)\s+\nJOB#: 2 stend +\n(Signed\s 0 stbegin +\a)\s\s\s\w+Signed\s 2 stend 
+\c)\n+Signed\s 2 stend +\d)\n+Signed\s 2 stend +\p)\n+Signed\s 2 stend +\(End of Report 0 stbegin +\a)\w+\(End of Report\) 2 stend +Instructions(:\n+\a 2 stend +Instructions:\n+(\a 0 stbegin +\n+(Follow 0 stbegin +\a)\w+\n+Follow 2 stend +\d+\s+\n+total dose 3 stend +\a)\n+\C\c+: 2 stend +\a\n+(\C\c+: 0 stbegin +\a)\n+\C\C+: 2 stend +\a\n+(\C\C+: 0 stbegin +\a)\s+\n+\C\C+: 2 stend +\a\s+\n+(\C\C+: 0 stbegin + +\c)\n+\C\C+\s\(\a\a+\): 2 stend +\c\n+(\C\C+\s\(\a\a+\): 0 stbegin +\d)\n+\C\C+\s\(\a\a+\): 2 stend +\d\n+(\C\C+\s\(\a\a+\): 0 stbegin + +\a)\n+T\s+ 2 stend +\a\n+(T)\s+ 0 stbegin +\a)\n+P\s+ 2 stend +\a\n+(P)\s+ 0 stbegin +\a)\n+R\s+ 2 stend +\a\n+(R)\s+ 0 stbegin +\a)\s+\n+R\s+ 2 stend +\a\s+\n+(R)\s+ 0 stbegin +\a)\n+BP\s+ 2 stend +\a\n+(BP)\s+ 0 stbegin +\a)\n+O2\s+ 2 stend +\a\n+(O2)\s+ 0 stbegin +\a)\w+Sig:\s+ 2 stend +\a\w+(Sig:\s+ 0 stbegin +\)(.)\s+\n+\d.\s+\c+ 2 stend +\).\s+\n+(\d.\s+\c+ 0 stbegin +\)(.)\s+\n+\d.\s+\c+ 2 stend +\))\s+\n+\d.\s+\c+ 2 stend +\)\s+\n+(\d.\s+\c+ 0 stbegin +\))\n+\d.\s+\c+ 2 stend +\)\n+(\d.\s+\c+ 0 stbegin +\a\n+(\d.\s+\C 0 stbegin +\a)\n+\d.\s+\C 2 stend +\a)*\n+\d.\s+\C 2 stend + + + +\a)\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 2 stend +\a)*+\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 2 stend +\a\n+(\d)\d\d\d-\d\d-\d\d\s\s\s+ 0 stbegin +\a)\s+\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 2 stend +\a\s+\n+(\d)\d\d\d-\d\d-\d\d\s\s\s+ 0 stbegin +\C\C+\n+(\d+.\s+\C 0 stbegin +\C)\n+\d+.\s+\C 2 stend +\d)\n+\C\C+ 2 stend + +\c)\n+\C\c+\s\(\a\a+\): 2 stend +\c\n+(\C\c+\s\(\a\a+\): 0 stbegin +\d)\n+\C\c+\s\(\a\a+\): 2 stend +\d\n+(\C\c+\s\(\a\a+\): 0 stbegin + +\c.\s+(-)\s+\C 0 stbegin +\c(.)\s+-\s+\C 2 stend +\c\s+(-)\s+\C 0 stbegin +\c\s+-\s+\C 2 stend +\C\c+\s+-\s+\C\c+ 3 stend +\C\c+(:)\s+\n+\c 2 stend +\C\c+:\s+\n+(\c 0 stbegin +\C\c+:\n+\s+(\c 0 stbegin +\C\c+(:)\n+\c 2 stend +\C\c+:\n+(\c 0 stbegin +\C\C\C:\n+(\c 0 stbegin +\C\C\C\):\n+(\c 0 stbegin +\sand)\s+\n+\C 3 stend +\sand)\s+\n+\c 3 stend +\sand)\n+\C 3 stend +\sand)\n+\c 3 stend + + + 
+\c(:)\s+\p+\s+\n 2 stend +\s\s\s+(·)\s+\C 0 stbegin +\c)\s\s\s+·\s\C 2 stend +\c)\s\s\s+·\s\C 2 stend +\p)\s\s\s+·\s\C 2 stend +\C\c+\s+(-)\s+\C\c+ 1 stbegin +\s\s+(P)atient Name: 0 stbegin +\s\s\s+(P)rocedure Date: 0 stbegin +\s\s\s+(D)ate of Birth: 0 stbegin +\s\s\s+(A)ge: 0 stbegin +\s\s\s+(G)ender: 0 stbegin +\s\s\s+(N)ote Status: 0 stbegin + +\a)\s\s+Patient Name: 2 stend +\a)\s\s\s+Procedure Date: 2 stend +\a)\s\s\s+Date of Birth: 2 stend +\a)\s\s\s+Age: 2 stend +\a)\s\s\s+Gender: 2 stend +\a)\s\s\s+Note Status: 2 stend +\n\n(\(\a+ 0 stbegin +\c.\n+(\(\a+ 0 stbegin +\n+(\c)\c+: 0 stbegin +\a+\)\n\s+\c\c+: 2 stend +\a+\)\s+\n+\c\c+: 2 stend +\a+\)\n+\c\c+: 2 stend +\a+\)\s+\n+\s+\c\c+: 2 stend +\c\n+\c\c+: 2 stend + +On)\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 3 stend +On)\s+\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 3 stend +on)\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 3 stend +on)\s+\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 3 stend +by)\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 3 stend +by)\s+\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 3 stend +since)\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 3 stend +since)\s+\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 3 stend +Since)\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 3 stend +Since)\s+\n+\d\d\d\d-\d\d-\d\d\s\s\s+ 3 stend + +\a)\s+**+\s+\n\n 2 stend +\a)\s+**+\n\n 2 stend + +\a(\a)\n+\d\).\s+\C 2 stend +\a\n+\d\)(.)\s+\C 3 stend +\C\C\n+(\d\).\s+\C 0 stbegin +\c.\n+(\d\).\s+\C 0 stbegin +\C\C.\n+(\d\).\s+\C 0 stbegin +\c.\s+\n+(\d)\).\s+\C 0 stbegin +.\s+\n+\d\)(.)\s+\C 3 stend + + +\a\n+(\d)\d-\d\d\s+ 0 stbegin +\a\s+\n+(\d)\d-\d\d\s+ 0 stbegin +\a.\s+\n+(\d)\d-\d\d\s+ 0 stbegin + +\c\w+(\d)\)\s+\C 0 stbegin +\c\c+.\w+(\d)\)\s+\C 0 stbegin +\c+\c(.)\w+\d\)\s+\C 2 stend + + +\a)\s+\n+\C\c+\s+\C\c+: 2 stend +\a\s+\n+(\C)\c+\s+\C\c+: 0 stbegin +\a)\s+\n+\C\c+: 2 stend +\a\s+\n+(\C)\c+: 0 stbegin +\a)\w+Date of Birth: 2 stend +\w+(D)ate of Birth: 0 stbegin + +\c:\s+\n(\a 0 stbegin +\sDr(.)\s 3 stend +\sMr(.)\s 3 stend +\sMrs(.)\s 3 stend +\sMs(.)\s 3 stend +\sth(e)\n\C\c+ 3 stend +\sTh(e)\n\C\c+ 3 stend +\si(n)\n\a+ 3 stend +\sI(n)\n\a+ 3 
stend +\sfo(r)\n\a+ 3 stend +\sb(y)\a+ 3 stend +\shi(s)\a+ 3 stend +\she(r)\a+ 3 stend +\swit(h)\a+ 3 stend +\so(n)\a+ 3 stend +\sO(n)\a+ 3 stend +\sunti(l)\a+ 3 stend +\sUnti(l)\a+ 3 stend +\so(f)\a+ 3 stend +\sthroug(h)\a+ 3 stend +\san(d)\a+ 3 stend +\so(r)\n\a+ 3 stend +\sa(s)\n\a+ 3 stend +\sincludin(g)\a+ 3 stend + + +Cardiac\w+Surgery\w+Intensive\w+Care\w+Unit 3 stend +\C\c+\w+Cardiac\w+Surge\(+ry\w+Intensive\w+Care\w+Unit 3 stend +Intensive\w+Care\w+Unit 3 stend +\C\c+\w+Intensive\w+Care\w+Unit 3 stend +Emergency\w+Department 3 stend +Coronary\w+Care\w+Unit 3 stend + +\a)*\n\n 2 stend +\a)*\n\d\d+.\s\C 2 stend +\a)*\n\d.\s\C 2 stend + +CENTER\w+(. 2 stend +Paterna(l 3 stend +HOSPITAL\w+(. 2 stend +CENTER\w+.\w+(\C 0 stbegin +HOSPITAL\w+.\w+(\C 0 stbegin +\C\c+\w+Surgery 3 stend + +\c.\c(.)\w+\C\c\c 2 stend +\wTR(.\w+\C\c\c 2 stend +\wTR.\w+(\C)\c\c 0 stbegin +.)\w+He 2 stend +.)\w+His 2 stend +.)\w+Her 2 stend +.)\w+She 2 stend +.)\w+We 2 stend +.)\w+Our 2 stend +.)\w+The 2 stend +.)\w+They 2 stend +.)\w+Their 2 stend +.)\w+I 2 stend +.)\w+My 2 stend + +.\w+(") 0 stbegin +.\w+(He 0 stbegin +.\w+(His 0 stbegin +.\w+(Her 0 stbegin +.\w+(She 0 stbegin +.\w+(We 0 stbegin +.\w+(Our 0 stbegin +.\w+(The 0 stbegin +.\w+(They 0 stbegin +.\w+(Their 0 stbegin +.\w+(I 0 stbegin +.\w+(My 0 stbegin +.\w+(But 0 stbegin +.\w+(Now 0 stbegin +.\w+(Discharge 0 stbegin +.\w+(This 0 stbegin +.\w+(That 0 stbegin +.\w+(this 0 stbegin +.\w+(that 0 stbegin +.\w+(he 0 stbegin +.\w+(she 0 stbegin +.\w+(we 0 stbegin +.\w+(our 0 stbegin +.\w+(his 0 stbegin +.\w+(her 0 stbegin +.\w+(they 0 stbegin +.\w+(their 0 stbegin +.\w+(my 0 stbegin +.\w+(but 0 stbegin diff --git a/examples/temporal_extraction/extract_temporal.py b/examples/temporal_extraction/extract_temporal.py new file mode 100644 index 00000000..c7395910 --- /dev/null +++ b/examples/temporal_extraction/extract_temporal.py @@ -0,0 +1,202 @@ +# the vast majority of the infrastructure in this program was inspired by: +# 
https://github.com/Machine-Learning-for-Medical-Language/curate-mimic/blob/main/extract_mimic_temporal.py +# unlike `extract_temporal`, this script reads from a directory instead of a single file. +import argparse +import json +import os +import pathlib +import pdb +import pickle +import requests +import sys + +from nltk.tokenize import wordpunct_tokenize as tokenize +from nltk.tokenize.util import align_tokens +from PyRuSH import RuSH +from tqdm import tqdm + +parser = argparse.ArgumentParser() +parser.add_argument("-d", "--data_dir", type=pathlib.Path, required=True, help="path to read data from. should be a directory (of json or txt files).") +parser.add_argument("-o", "--out_dir", type=pathlib.Path, required=True, help="directory in which to save output") +parser.add_argument("-s", "--sentence_dir", type=pathlib.Path, required=True, help="directory in which to save sentences") +parser.add_argument("-u", "--rest_url", type=str, default="http://0.0.0.0:8000/temporal/process", + help="Primary REST server. Use GPU REST server for high throughput.") +parser.add_argument("--backup_rest_url", type=str, default="http://0.0.0.0:8000/temporal/process", + help=("Backup REST server. This server will be used if the primary server fails to process " + "(often due to VRAM restrictions). Use a CPU REST server for stability, " + "especially with large or long documents.")) +parser.add_argument("--input_format", choices=["json", "pkl", "txt"], default="json") +parser.add_argument("--text_name", type=str, default="text", help="key to access the text in a dictionary format") +parser.add_argument("--output_format", choices=["json", "pkl"], default="json") +args = parser.parse_args() + +rush = RuSH("conf/rush_rules.tsv") +#rush = RuSH("conf/rush_rules_cr.tsv") # Use this if you want to use as a paragraph splitter. 
def read_file(filename):
    """Return the text of one input document found in ``args.data_dir``.

    The file is decoded according to ``args.input_format``:

    * ``"txt"``  -- the whole file is the text;
    * ``"json"`` -- the file holds an object and the text is stored under
      the ``args.text_name`` key;
    * ``"pkl"``  -- same as ``"json"`` but the object is pickled.

    Raises:
        ValueError: if ``args.input_format`` is not one of the above.
        KeyError: if a json/pkl document has no ``args.text_name`` entry.
    """
    path = os.path.join(args.data_dir, filename)
    if args.input_format == "txt":
        with open(path, "r") as f:
            return f.read()
    if args.input_format == "json":
        with open(path, "r") as f:
            return json.load(f)[args.text_name]
    if args.input_format == "pkl":
        # NOTE(security): unpickling runs arbitrary code embedded in the file;
        # only point --data_dir at pickles you produced yourself.
        with open(path, "rb") as f:
            return pickle.load(f)[args.text_name]
    raise ValueError(f"Unsupported input format: {args.input_format!r}")


def write_file(data, out_filename):
    """Serialise ``data`` to ``out_filename`` using ``args.output_format``.

    Raises:
        ValueError: if ``args.output_format`` is neither ``"json"`` nor
            ``"pkl"`` (previously such a value silently wrote nothing).
    """
    if args.output_format == "json":
        with open(out_filename, "w") as f:
            json.dump(data, f)
    elif args.output_format == "pkl":
        with open(out_filename, "wb") as f:
            pickle.dump(data, f)
    else:
        raise ValueError(f"Unsupported output format: {args.output_format!r}")


def tokenize_sentences(sents, text):
    """Tokenize every sentence span and keep the span next to its tokens.

    Args:
        sents: sentence spans exposing ``begin`` / ``end`` character offsets
            into ``text`` (as returned by ``rush.segToSentenceSpans``).
        text: the document the spans refer to.

    Returns:
        A list of ``(span, tokens)`` pairs, one per sentence that produced at
        least one token. Returning the pairs (rather than only the tokens)
        lets callers keep spans, tokens and per-sentence server results
        index-aligned even when some sentences are dropped.
    """
    pairs = []
    for sent in sents:
        sent_text = text[sent.begin:sent.end]
        tokens = tokenize(sent_text)
        # NOTE: `extract_mimic_temporal.py` has some commented-out code here
        # that's supposed to fix alignment issues
        # A sentence ending in a newline gets a trailing empty token. The
        # check is made on the sentence itself so that an empty span never
        # inspects the character that precedes it.
        if sent_text.endswith("\n"):
            tokens.append("")
        if tokens:
            pairs.append((sent, tokens))
    return pairs


def preprocess(sents, text=None):
    """Return the token list of every sentence that has at least one token.

    Args:
        sents: sentence spans with ``begin`` / ``end`` offsets.
        text: the document the spans index into. When omitted, a module-level
            variable named ``text`` is used, which is how this function was
            originally called.

    Returns:
        A list of token lists. Sentences without tokens are skipped, so the
        result may be shorter than ``sents``; use ``tokenize_sentences`` when
        the matching spans are needed as well.
    """
    if text is None:
        # Legacy call style: fall back to the module global.
        text = globals()["text"]
    return [tokens for _, tokens in tokenize_sentences(sents, text)]


def strip_extension(filename, input_format):
    """Return ``filename`` without its trailing ``"." + input_format``.

    Only the final extension is removed, so ``note.v2.json`` becomes
    ``note.v2``. Splitting on the first dot instead would map distinct inputs
    such as ``a.1.json`` and ``a.2.json`` to the same output name.
    """
    suffix = "." + input_format
    if filename.endswith(suffix):
        return filename[:-len(suffix)]
    return os.path.splitext(filename)[0]


def resolve_relation_arg(arg, event_ids, timex_ids):
    """Translate a relation argument such as ``"EVENT-3"`` into an entity id.

    Args:
        arg: ``"<TYPE>-<index>"`` with TYPE ``EVENT`` or ``TIMEX``, or None.
        event_ids: ids of the events of the current sentence, in order.
        timex_ids: ids of the timexes of the current sentence, in order.

    Returns:
        The matching id, or None when ``arg`` is None, malformed, of an
        unknown type, or points outside the corresponding id list.
    """
    if arg is None:
        return None
    arg_type, _, raw_index = arg.rpartition("-")
    candidates = {"EVENT": event_ids, "TIMEX": timex_ids}.get(arg_type)
    if candidates is None:
        return None
    try:
        index = int(raw_index)
    except ValueError:
        return None
    if not 0 <= index < len(candidates):
        return None
    return candidates[index]


def build_entity_record(entity_id, note_id, sent_idx, entity, token_spans,
                        sent_begin, text):
    """Build the output fields shared by timex and event records.

    Args:
        entity_id: identifier assigned to this entity.
        note_id: name of the source document (file name without extension).
        sent_idx: index of the sentence containing the entity.
        entity: mapping with ``"begin"`` / ``"end"`` token indices; the end
            index is used inclusively.
        token_spans: ``(start, end)`` character spans of the sentence tokens,
            relative to the start of the sentence.
        sent_begin: character offset of the sentence within ``text``.
        text: the full document.

    Returns:
        A dict holding token indices, sentence-relative character offsets
        (``*_char``), document-relative offsets (``*_origin``) and the covered
        text. Callers append the type-specific field afterwards.
    """
    begin_char = token_spans[entity["begin"]][0]
    end_char = token_spans[entity["end"]][1]
    begin_origin = begin_char + sent_begin
    end_origin = end_char + sent_begin
    return {
        "note_id": note_id,  # NOTE: this is "row_id" in `extract_mimic_temporal`
        "entity_id": entity_id,
        "sent_index": sent_idx,
        "begin": entity["begin"],
        "end": entity["end"],
        "sent_begin": sent_begin,
        "begin_char": begin_char,
        "end_char": end_char,
        "begin_origin": begin_origin,
        "end_origin": end_origin,
        "text": text[begin_origin:end_origin],
    }


def main():
    """Run temporal extraction over every matching file in ``args.data_dir``.

    For each input file that has no output yet: read the text, split it into
    sentences with RuSH, tokenize them, post the tokens to the REST server
    (retrying once on the backup server after an HTTP 500), convert the
    returned token indices into character offsets, and write the
    timexes/events/relations to ``args.out_dir`` and the sentence strings to
    ``args.sentence_dir``.

    Raises:
        RuntimeError: if the server that answered last did not return 200.
    """
    os.makedirs(args.out_dir, exist_ok=True)
    os.makedirs(args.sentence_dir, exist_ok=True)

    suffix = "." + args.input_format
    in_files = [f for f in os.listdir(args.data_dir) if f.endswith(suffix)]

    retry_attempts_cnt = 0

    for filename in tqdm(in_files):
        bare_filename = strip_extension(filename, args.input_format)
        out_filename = bare_filename + "." + args.output_format
        if os.path.exists(os.path.join(args.out_dir, out_filename)):
            # Output already present: lets an interrupted run be resumed.
            continue
        text = read_file(filename)
        if not text:
            sys.stderr.write(f"Empty file: {filename}\n")
            continue

        sents = rush.segToSentenceSpans(text)
        if not sents:
            sys.stderr.write(f"No sentences found in {filename}; skipping.\n")
            continue
        # Keep only sentences that yielded tokens, paired with those tokens.
        # The server answers one entry per submitted sentence, so everything
        # below is indexed by position in this filtered list.
        tokenized = tokenize_sentences(sents, text)
        if not tokenized:
            sys.stderr.write(f"No sentences in {filename} were tokenizable; skipping.\n")
            continue
        sent_tokens = [tokens for _, tokens in tokenized]

        # send off to rest
        payload = {"sent_tokens": sent_tokens, "metadata": f"FNAME={filename}"}
        r = requests.post(args.rest_url, json=payload)
        if r.status_code == 500:
            sys.stderr.write(f"Failed from primary server.\nRe-try with alternative server :{args.backup_rest_url}\n")
            r = requests.post(args.backup_rest_url, json=payload)
            retry_attempts_cnt += 1
            sys.stderr.write(f"Current retry_attempts_cnt: {retry_attempts_cnt}\n")
        if r.status_code != 200:
            raise RuntimeError(f"Problem processing {filename}: status code {r.status_code}")

        out_json = r.json()

        events_docs, timexes_docs, rels_docs = [], [], []
        sent_text_list = []
        for sent_idx, (sent, tokens) in enumerate(tokenized):
            # Same [begin:end) slice the tokens were produced from; the
            # earlier `sent.end+1` slice leaked one character of what follows
            # the sentence into the saved sentence text.
            sent_text = text[sent.begin:sent.end]
            sent_text_list.append(sent_text)
            token_spans = align_tokens(tokens, sent_text)

            timexes, timex_ids = [], []
            for timex in out_json["timexes"][sent_idx]:
                timex_id = f"Timex_{bare_filename}_Sent-{sent_idx}_Ind-{len(timex_ids)}"
                timex_ids.append(timex_id)
                record = build_entity_record(
                    timex_id, bare_filename, sent_idx, timex,
                    token_spans, sent.begin, text,
                )
                # NOTE(review): the output key is spelled "timeClas"; it is
                # kept unchanged because existing consumers may read it.
                record["timeClas"] = timex["timeClass"]
                timexes.append(record)

            events, event_ids = [], []
            for event in out_json["events"][sent_idx]:
                event_id = f"Event_{bare_filename}_Sent-{sent_idx}_Ind-{len(event_ids)}"
                event_ids.append(event_id)
                record = build_entity_record(
                    event_id, bare_filename, sent_idx, event,
                    token_spans, sent.begin, text,
                )
                record["dtr"] = event["dtr"]
                events.append(record)

            rels = []
            for rel in out_json["relations"][sent_idx]:
                arg1 = resolve_relation_arg(rel["arg1"], event_ids, timex_ids)
                arg2 = resolve_relation_arg(rel["arg2"], event_ids, timex_ids)
                if arg1 is None or arg2 is None:
                    # Missing or unresolvable argument: drop the relation.
                    continue
                # NOTE(review): relations use "row_id" where entity records
                # use "note_id"; left as is for output compatibility.
                rels.append({"row_id": bare_filename,
                             "sent_index": sent_idx,
                             "arg1": arg1,
                             "arg2": arg2,
                             "category": rel["category"]})

            timexes_docs.append(timexes)
            events_docs.append(events)
            rels_docs.append(rels)

        temporal_info = {
            "timexes": timexes_docs,  # TODO call these "timex" and "event"?
            "events": events_docs,
            "relations": rels_docs,
        }
        write_file(
            temporal_info,
            os.path.join(args.out_dir, out_filename)
        )
        write_file(
            {"sentences": sent_text_list},
            os.path.join(args.sentence_dir, out_filename)
        )

    # Final summary once every file has been handled.
    sys.stderr.write(f"Current retry_attempts_cnt: {retry_attempts_cnt}\n")


if __name__ == "__main__":
    main()