diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..02310e2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,175 @@ + +.idea/misc.xml +.idea/dictionaries + +# Created by https://www.gitignore.io/api/intellij,jetbrains,java,maven,eclipse + +### Eclipse ### + +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# Eclipse Core +.project + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# JDT-specific (Eclipse Java Development Tools) +.classpath + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +### Intellij ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/workspace.xml +.idea/tasks.xml + +# Sensitive or high-churn files: +.idea/dataSources/ +.idea/dataSources.ids +.idea/dataSources.xml +.idea/dataSources.local.xml +.idea/sqlDataSources.xml +.idea/dynamic.xml +.idea/uiDesigner.xml + +# Gradle: +.idea/gradle.xml +.idea/libraries + +# Mongo Explorer plugin: +.idea/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +### Intellij Patch ### +# Comment Reason: 
https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +### Java ### +*.class + +# BlueJ files +*.ctxt + +# Mobile Tools for Java (J2ME) +.mtj.tmp/ + +# Package Files # +*.jar +*.war +*.ear + +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml +hs_err_pid* + +### JetBrains ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: + +# Sensitive or high-churn files: + +# Gradle: + +# Mongo Explorer plugin: + +## File-based project format: + +## Plugin-specific files: + +# IntelliJ + +# mpeltonen/sbt-idea plugin + +# JIRA plugin + +# Crashlytics plugin (for Android Studio and IntelliJ) + +### JetBrains Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +### Maven ### +target/ +pom.xml.tag +pom.xml.releaseBackup +pom.xml.versionsBackup +pom.xml.next +release.properties +dependency-reduced-pom.xml +buildNumber.properties +.mvn/timing.properties + +# Exclude maven wrapper +!/.mvn/wrapper/maven-wrapper.jar + +# End of https://www.gitignore.io/api/intellij,jetbrains,java,maven,eclipse diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 0000000..4e42293 --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/copyright/Lambda___GPL_v3_.xml b/.idea/copyright/Lambda___GPL_v3_.xml new file mode 100644 index 0000000..d201bc0 --- /dev/null +++ b/.idea/copyright/Lambda___GPL_v3_.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/copyright/Lambda___MIT_.xml b/.idea/copyright/Lambda___MIT_.xml new file mode 100644 index 0000000..c987fe2 --- /dev/null +++ 
b/.idea/copyright/Lambda___MIT_.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/copyright/profiles_settings.xml b/.idea/copyright/profiles_settings.xml new file mode 100644 index 0000000..ca4cabf --- /dev/null +++ b/.idea/copyright/profiles_settings.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..b26911b --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..6612519 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..25ec0e9 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..6c792a2 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,10 @@ +language: java +script: mvn test -B +jdk: oraclejdk8 + +cache: + directories: + - $HOME/.m2 + +before_install: + - ./install-SentenceSimplification.sh diff --git a/DiscourseSimplification.iml b/DiscourseSimplification.iml new file mode 100644 index 0000000..6df30cf --- /dev/null +++ b/DiscourseSimplification.iml @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..9cecc1d --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. 
<http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it.
+ + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. 
+ + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + {one line to give the program's name and a brief idea of what it does.} + Copyright (C) {year} {name of author} + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see http://www.gnu.org/licenses/. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + {project} Copyright (C) {year} {fullname} + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +http://www.gnu.org/licenses/. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +http://www.gnu.org/philosophy/why-not-lgpl.html.
diff --git a/README.md b/README.md new file mode 100644 index 0000000..8d4c9a7 --- /dev/null +++ b/README.md @@ -0,0 +1,30 @@ +[![Build Status](https://travis-ci.org/Lambda-3/DiscourseSimplification.svg?branch=master)](https://travis-ci.org/Lambda-3/DiscourseSimplification) + +# Discourse Simplification + +A project for simplifying sentences with respect to discourse/rhetorical structures. +This works as a wrapper for the [SentenceSimplification](https://github.com/Lambda-3/SentenceSimplification) project. + +## Dependencies + +### SentenceSimplification + +Clone and install it locally: + + git clone --branch v5.0.0 https://github.com/Lambda-3/SentenceSimplification.git + cd SentenceSimplification + mvn install + +## Building and Running + + mvn package + +### Run the program + + mvn clean compile exec:java + +## Use as library +See `App.java` for an example, +or its usage in the [Graphene](https://github.com/Lambda-3/Graphene) project. + + diff --git a/install-SentenceSimplification.sh b/install-SentenceSimplification.sh new file mode 100755 index 0000000..7de0be1 --- /dev/null +++ b/install-SentenceSimplification.sh @@ -0,0 +1,7 @@ +#!/bin/sh +VERSION=5.0.0 + +set -ex +wget https://github.com/Lambda-3/SentenceSimplification/archive/v$VERSION.tar.gz +tar xfa v$VERSION.tar.gz +cd SentenceSimplification-$VERSION && mvn install -B -DskipTest diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..bf2088d --- /dev/null +++ b/pom.xml @@ -0,0 +1,174 @@ + + + + 4.0.0 + + org.lambda3.text.simplification + discourse-simplification + 5.0.0 + jar + + Discourse Simplification + + Discourse Simplification + + + scm:git:https://github.com/Lambda-3/DiscourseSimplification + scm:git:https://github.com/Lambda-3/DiscourseSimplification + https://github.com/Lambda-3/DiscourseSimplification + + + + + 5.0.0 + + 3.7.0 + 1.1.8 + + 1.0.0-M3 + 5.0.0-M3 + + 1.8 + UTF-8 + UTF-8 + + + + + + + org.lambda3.text.simplification + sentence-simplification + ${simplification.version} + + + + + edu.stanford.nlp + 
stanford-corenlp + ${corenlp.version} + + + slf4j-api + org.slf4j + + + + + edu.stanford.nlp + stanford-corenlp + ${corenlp.version} + models + + + slf4j-api + org.slf4j + + + + + + + ch.qos.logback + logback-classic + ${logback.version} + + + + + org.junit.jupiter + junit-jupiter-api + ${junit.jupiter.version} + test + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + true + + ${jdk.version} + ${jdk.version} + + + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + + + org.codehaus.mojo + exec-maven-plugin + 1.5.0 + + org.lambda3.text.simplification.discourse.App + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.0.0 + + + jar-with-dependencies + + + + + org.lambda3.text.simplification.sentence.segmentation.SentenceSeparator + + + + + + + package + + single + + + + + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + package + + jar-no-fork + + + + + + + diff --git a/src/main/java/org/lambda3/text/simplification/discourse/App.java b/src/main/java/org/lambda3/text/simplification/discourse/App.java new file mode 100644 index 0000000..e4f9f44 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/App.java @@ -0,0 +1,47 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : App + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse; + +import org.lambda3.text.simplification.discourse.processing.Processor; +import org.lambda3.text.simplification.discourse.sentence_simplification.element.DCore; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +/** + * Hello world! + */ +public class App { + private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(App.class); + private static final Processor PROCESSOR = new Processor(); + + public static void main(String[] args) throws IOException { + List cores = PROCESSOR.process(new File("input.txt"), Processor.ProcessingType.WHOLE); +// List cores = PROCESSOR.process("The whole text...", Processor.ProcessingType.WHOLE); +// List cores = PROCESSOR.processWikipediaArticles(Arrays.asList("Barack_Obama"), Processor.ProcessingType.WHOLE); + + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/Test.java b/src/main/java/org/lambda3/text/simplification/discourse/Test.java new file mode 100644 index 0000000..e05ad6b --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/Test.java @@ -0,0 +1,66 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : Test + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse; + +import edu.stanford.nlp.trees.Tree; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeParser; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeVisualizer; +import org.lambda3.text.simplification.discourse.utils.sentences.SentencesUtils; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.List; + +/** + * + */ +public class Test { + private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(App.class); + + public static void printParseTree(File file) throws FileNotFoundException { + List sentences = SentencesUtils.splitIntoSentencesFromFile(file); + + printParseTree(sentences); + } + + public static void printParseTree(String text) { + List sentences = SentencesUtils.splitIntoSentences(text); + + printParseTree(sentences); + } + + public static void printParseTree(List sentences) { + for (String sentence : sentences) { + LOGGER.info("Generate parse tree for sentence:\n'{}'", sentence); + try { + Tree parseTree = ParseTreeParser.parse(sentence); + LOGGER.info(ParseTreeVisualizer.prettyPrint(parseTree)); + } catch (ParseTreeException e) { + LOGGER.error("Failed to generate parse tree"); + } + } + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/processing/ExtendedProcessor.java 
b/src/main/java/org/lambda3/text/simplification/discourse/processing/ExtendedProcessor.java new file mode 100644 index 0000000..4b47374 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/processing/ExtendedProcessor.java @@ -0,0 +1,80 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : ExtendedProcessor + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.processing; + +import org.lambda3.text.simplification.discourse.sentence_simplification.element.DCore; +import org.lambda3.text.simplification.discourse.utils.sentences.SentencesUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +/** + * + */ +public class ExtendedProcessor extends Processor { + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public static List filterSentences(List sentences, boolean shuffleSentences, Integer maxSentenceLength, Integer maxSentences) { + + // select sentences to process + List res = new ArrayList<>(); + res.addAll(sentences); + + // shuffle + if (shuffleSentences) { + Collections.shuffle(res); + } + + // remove too long sentences + if (maxSentenceLength != null) { + res = res.stream().filter(s -> s.length() <= maxSentenceLength).collect(Collectors.toList()); + } + + // limit number of sentences + if (maxSentences != null) { + if (res.size() > maxSentences) { + res = res.subList(0, maxSentences); + } + } + + return res; + } + + public List process(File file, ProcessingType type, boolean shuffleSentences, Integer maxSentenceLength, Integer maxSentences) throws FileNotFoundException { + return process(SentencesUtils.splitIntoSentencesFromFile(file), type, shuffleSentences, maxSentenceLength, maxSentences); + } + + public List process(String text, ProcessingType type, boolean shuffleSentences, Integer maxSentenceLength, Integer maxSentences) { + return process(SentencesUtils.splitIntoSentences(text), type, shuffleSentences, maxSentenceLength, maxSentences); + } + + public List process(List sentences, ProcessingType type, boolean shuffleSentences, Integer maxSentenceLength, Integer 
maxSentences) { + return process(filterSentences(sentences, shuffleSentences, maxSentenceLength, maxSentences), type); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/processing/Processor.java b/src/main/java/org/lambda3/text/simplification/discourse/processing/Processor.java new file mode 100644 index 0000000..44c582c --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/processing/Processor.java @@ -0,0 +1,163 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : Processor + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.processing; + +import org.lambda3.text.simplification.discourse.relation_extraction.DiscourseExtractor; +import org.lambda3.text.simplification.discourse.relation_extraction.element.DiscourseCore; +import org.lambda3.text.simplification.discourse.sentence_simplification.Simplifier; +import org.lambda3.text.simplification.discourse.sentence_simplification.element.DCore; +import org.lambda3.text.simplification.discourse.tree.DiscourseTreeCreator; +import org.lambda3.text.simplification.discourse.utils.sentences.SentencesUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +/** + * + */ +public class Processor { + private final static DiscourseTreeCreator DISCOURSE_TREE_CREATOR = new DiscourseTreeCreator(); + private final static DiscourseExtractor DISCOURSE_EXTRACTOR = new DiscourseExtractor(); + private final static Simplifier SIMPLIFIER = new Simplifier(); + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public List process(File file, ProcessingType type) throws FileNotFoundException { + List sentences = SentencesUtils.splitIntoSentencesFromFile(file); + return process(sentences, type); + } + + public List process(String text, ProcessingType type) { + List sentences = SentencesUtils.splitIntoSentences(text); + return process(sentences, type); + } + + public List process(List sentences, ProcessingType type) { + if (type.equals(ProcessingType.SEPARATE)) { + return processSeparate(sentences); + } else if (type.equals(ProcessingType.WHOLE)) { + return processWhole(sentences); + } else { + throw new IllegalArgumentException("Unknown ProcessingType."); + } + } + + // creates one discourse tree over all sentences (investigates intra-sentential and inter-sentential 
relations) + private List processWhole(List sentences) { + List res = new ArrayList<>(); + + // Step 1) create document discourse tree + logger.info("Step 1) Create document discourse tree"); + DISCOURSE_TREE_CREATOR.reset(); + + int idx = 0; + for (String sentence : sentences) { + logger.info("### Processing sentence ###"); + logger.info(sentence); + + // extend discourse tree + DISCOURSE_TREE_CREATOR.addSentence(sentence, idx); + DISCOURSE_TREE_CREATOR.update(); + if (logger.isDebugEnabled()) { + + Optional.ofNullable(DISCOURSE_TREE_CREATOR.getLastSentenceTree()) + .ifPresent(t -> logger.debug(t.toString())); + +// logger.debug(DISCOURSE_TREE_CREATOR.getDiscourseTree().toString()); // to show the current document discourse tree + } + + ++idx; + } + + // Step 2) extract discourse cores + logger.info("Step 2) extract discourse cores"); + + List discourseCores = DISCOURSE_EXTRACTOR.extract(DISCOURSE_TREE_CREATOR.getDiscourseTree()); + if (logger.isDebugEnabled()) { + discourseCores.forEach(x -> logger.debug(x.toString())); + } + + // Step 3) generate output format + logger.info("Step 3) Generate output format"); + + List dCores = SIMPLIFIER.simplify(discourseCores); + res.addAll(dCores); + + if (logger.isInfoEnabled()) { + dCores.forEach(core -> logger.info(core.toString())); + } + + return res; + } + + // creates discourse trees for each individual sentence (investigates intra-sentential relations only) + private List processSeparate(List sentences) { + List res = new ArrayList<>(); + + int idx = 0; + for (String sentence : sentences) { + logger.info("### Processing sentence ###"); + logger.info("'" + sentence + "'"); + + // Step 1) create sentence discourse tree + logger.debug("Step 1) Create sentence discourse tree"); + DISCOURSE_TREE_CREATOR.reset(); + DISCOURSE_TREE_CREATOR.addSentence(sentence, idx); + DISCOURSE_TREE_CREATOR.update(); + if (logger.isDebugEnabled()) { + logger.debug(DISCOURSE_TREE_CREATOR.getDiscourseTree().toString()); + } + + // Step 2) 
extract discourse cores + logger.debug("Step 2) extract discourse cores"); + + List discourseCores = DISCOURSE_EXTRACTOR.extract(DISCOURSE_TREE_CREATOR.getDiscourseTree()); + if (logger.isDebugEnabled()) { + discourseCores.forEach(x -> logger.debug(x.toString())); + } + + // Step 3) generate output format + logger.debug("Step 3) generate output format"); + + List dCores = SIMPLIFIER.simplify(discourseCores); + res.addAll(dCores); + + if (logger.isInfoEnabled()) { + dCores.forEach(core -> logger.info(core.toString())); + } + + ++idx; + } + + return res; + } + + public enum ProcessingType { + SEPARATE, + WHOLE + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/relation_extraction/DiscourseExtractor.java b/src/main/java/org/lambda3/text/simplification/discourse/relation_extraction/DiscourseExtractor.java new file mode 100644 index 0000000..0178af4 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/relation_extraction/DiscourseExtractor.java @@ -0,0 +1,175 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : DiscourseExtractor + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.relation_extraction; + +import org.lambda3.text.simplification.discourse.relation_extraction.element.DiscourseContext; +import org.lambda3.text.simplification.discourse.relation_extraction.element.DiscourseCore; +import org.lambda3.text.simplification.discourse.relation_extraction.relation.DiscourseCoreContextRelation; +import org.lambda3.text.simplification.discourse.relation_extraction.relation.DiscourseCoreCoreRelation; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.model.Coordination; +import org.lambda3.text.simplification.discourse.tree.model.DiscourseTree; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.tree.model.Subordination; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.stream.Collectors; + +/** + * Created by Matthias on 08.12.16. 
+ */ +public class DiscourseExtractor { + private static final List IGNORED_RELATIONS = Arrays.asList( + Relation.UNKNOWN_COORDINATION + ); + private final Logger logger = LoggerFactory.getLogger(getClass()); + private LinkedHashMap processedCores; + private LinkedHashMap processedContexts; + + public DiscourseExtractor() { + this.processedCores = new LinkedHashMap(); + this.processedContexts = new LinkedHashMap(); + } + + public List extract(DiscourseTree discourseTree) { + this.processedCores = new LinkedHashMap(); + this.processedContexts = new LinkedHashMap(); + + extractRec(discourseTree); + + return processedCores.values().stream().collect(Collectors.toList()); + } + + // should be called on a superordinate node + private List getCores(DiscourseTree node) { + List res = new ArrayList(); + + for (Leaf leaf : node.getNucleusPathLeaves()) { + DiscourseCore core; + if (processedCores.containsKey(leaf)) { + core = processedCores.get(leaf); + } else { + core = new DiscourseCore(leaf.getText(), leaf.getSentenceIdx()); + processedCores.put(leaf, core); + } + res.add(core); + } + + return res; + } + + // should be called on a subordinate node + private List getContexts(DiscourseTree node) { + List res = new ArrayList(); + + for (Leaf leaf : node.getNucleusPathLeaves()) { + DiscourseContext context; + if (processedContexts.containsKey(leaf)) { + context = processedContexts.get(leaf); + } else { + context = new DiscourseContext(leaf.getText(), leaf.getSentenceIdx()); + if (leaf.getType().equals(Leaf.Type.SENT_SIM_CONTEXT)) { + context.setSentSimContext(); + } + processedContexts.put(leaf, context); + + } + res.add(context); + } + + return res; + } + + // only visit nucleus nodes, do not handle References + private void extractRec(DiscourseTree node) { + + if (node instanceof Leaf) { + getCores(node); + } + + if (node instanceof Coordination) { + Coordination coordination = (Coordination) node; + + // recursion + for (DiscourseTree child : 
coordination.getCoordinations()) { + extractRec(child); + } + + // add core relations + if (!IGNORED_RELATIONS.contains(coordination.getRelation())) { + for (DiscourseTree child : coordination.getCoordinations()) { + List childCores = getCores(child); + + // forward direction + for (DiscourseTree sibling : coordination.getOtherFollowingCoordinations(child)) { + List siblingCores = getCores(sibling); + + for (DiscourseCore childCore : childCores) { + for (DiscourseCore siblingCore : siblingCores) { + childCore.addCoreRelation(new DiscourseCoreCoreRelation(coordination.getRelation(), siblingCore)); + } + } + } + + // reverse direction + if (coordination.getRelation().getReverseRelation().isPresent()) { + for (DiscourseTree sibling : coordination.getOtherPrecedingCoordinations(child)) { + List siblingCores = getCores(sibling); + + for (DiscourseCore childCore : childCores) { + for (DiscourseCore siblingCore : siblingCores) { + childCore.addCoreRelation(new DiscourseCoreCoreRelation(coordination.getRelation().getReverseRelation().get(), siblingCore)); + } + } + } + } + + } + } + } + + if (node instanceof Subordination) { + Subordination subordination = (Subordination) node; + + // recursion + extractRec(subordination.getSuperordination()); + + // add context relations + if (!IGNORED_RELATIONS.contains(subordination.getRelation())) { + List cores = getCores(subordination.getSuperordination()); + List contexts = getContexts(subordination.getSubordination()); + + for (DiscourseCore core : cores) { + for (DiscourseContext context : contexts) { + core.addContextRelation(new DiscourseCoreContextRelation(subordination.getRelation(), context)); + } + } + } + } + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/relation_extraction/element/DiscourseContext.java b/src/main/java/org/lambda3/text/simplification/discourse/relation_extraction/element/DiscourseContext.java new file mode 100644 index 0000000..8ce971c --- /dev/null +++ 
package org.lambda3.text.simplification.discourse.relation_extraction.element;

import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A context text together with a sentence index and a one-way
 * "sentence-simplification context" flag.
 *
 * <p>The flag is {@code false} after construction and can only be switched on via
 * {@link #setSentSimContext()}. The class does not override {@code equals}/{@code hashCode},
 * so instances are compared by identity.
 */
public class DiscourseContext implements PrettyTreePrinter.Node {
    private final String text;
    private final int sentenceIdx;
    private boolean sentSimContext;

    /**
     * @param text        the context text
     * @param sentenceIdx the sentence index stored alongside the text
     */
    public DiscourseContext(String text, int sentenceIdx) {
        this.text = text;
        this.sentenceIdx = sentenceIdx;
        this.sentSimContext = false;
    }

    /** Marks this context as a sentence-simplification context; there is no way to unset it. */
    public void setSentSimContext() {
        this.sentSimContext = true;
    }

    /** @return the text passed to the constructor */
    public String getText() {
        return text;
    }

    /** @return the sentence index passed to the constructor */
    public int getSentenceIdx() {
        return sentenceIdx;
    }

    /** @return {@code true} once {@link #setSentSimContext()} has been called */
    public boolean isSentSimContext() {
        return sentSimContext;
    }

    /**
     * @return a single-element list holding the quoted text, followed by
     *         {@code " [s-context]"} when the flag is set
     */
    @Override
    public List<String> getPTPCaption() {
        String caption = "'" + text + "'";
        if (sentSimContext) {
            caption += " [s-context]";
        }
        return Collections.singletonList(caption);
    }

    /** @return a new, empty, mutable list: a context has no outgoing edges */
    @Override
    public List<PrettyTreePrinter.Edge> getPTPEdges() {
        return new ArrayList<>();
    }
}
package org.lambda3.text.simplification.discourse.relation_extraction.element;

import org.lambda3.text.simplification.discourse.relation_extraction.relation.DiscourseCoreContextRelation;
import org.lambda3.text.simplification.discourse.relation_extraction.relation.DiscourseCoreCoreRelation;
import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

/**
 * A core text with a sentence index and two duplicate-free lists of outgoing relations:
 * relations to other cores and relations to contexts.
 *
 * <p>Duplicates are detected with {@code List.contains}, i.e. through the relation classes'
 * {@code equals}. The class itself does not override {@code equals}/{@code hashCode}.
 */
public class DiscourseCore implements PrettyTreePrinter.Node {
    private final String text;
    private final int sentenceIdx;
    private final List<DiscourseCoreCoreRelation> coreRelations;
    private final List<DiscourseCoreContextRelation> contextRelations;

    /**
     * @param text        the core text
     * @param sentenceIdx the sentence index stored alongside the text
     */
    public DiscourseCore(String text, int sentenceIdx) {
        this.text = text;
        this.sentenceIdx = sentenceIdx;
        this.coreRelations = new ArrayList<>();
        this.contextRelations = new ArrayList<>();
    }

    /** @return the text passed to the constructor */
    public String getText() {
        return text;
    }

    /** @return the sentence index passed to the constructor */
    public int getSentenceIdx() {
        return sentenceIdx;
    }

    /**
     * Appends a core relation unless an equal one is already stored.
     *
     * @param coreRelation the relation to add
     */
    public void addCoreRelation(DiscourseCoreCoreRelation coreRelation) {
        if (coreRelations.contains(coreRelation)) {
            return;
        }
        coreRelations.add(coreRelation);
    }

    /** @return the internal (live, mutable) list of core relations */
    public List<DiscourseCoreCoreRelation> getCoreRelations() {
        return coreRelations;
    }

    /**
     * Appends a context relation unless an equal one is already stored.
     *
     * @param contextRelation the relation to add
     */
    public void addContextRelation(DiscourseCoreContextRelation contextRelation) {
        if (contextRelations.contains(contextRelation)) {
            return;
        }
        contextRelations.add(contextRelation);
    }

    /** @return the internal (live, mutable) list of context relations */
    public List<DiscourseCoreContextRelation> getContextRelations() {
        return contextRelations;
    }

    /** @return a single-element list holding the text wrapped in single quotes */
    @Override
    public List<String> getPTPCaption() {
        return Collections.singletonList("'" + text + "'");
    }

    /**
     * Builds one edge per relation: core relations first (edge flag {@code false}),
     * then context relations (edge flag {@code true}), all with an empty caption.
     *
     * @return a new list of edges in that order
     */
    @Override
    public List<PrettyTreePrinter.Edge> getPTPEdges() {
        List<PrettyTreePrinter.Edge> edges = new ArrayList<>();

        for (DiscourseCoreCoreRelation coreRelation : coreRelations) {
            edges.add(new PrettyTreePrinter.DefaultEdge("", coreRelation.getCore(), false));
        }
        for (DiscourseCoreContextRelation contextRelation : contextRelations) {
            edges.add(new PrettyTreePrinter.DefaultEdge("", contextRelation.getContext(), true));
        }

        return edges;
    }

    /** @return the result of {@code PrettyTreePrinter.prettyPrint(this, false, 40)} */
    @Override
    public String toString() {
        return PrettyTreePrinter.prettyPrint(this, false, 40);
    }
}
package org.lambda3.text.simplification.discourse.relation_extraction.relation;

import org.lambda3.text.simplification.discourse.relation_extraction.element.DiscourseContext;
import org.lambda3.text.simplification.discourse.tree.Relation;

import java.util.Objects;

/**
 * Immutable pair of a {@link Relation} and the {@link DiscourseContext} it points to.
 *
 * <p>Two instances are equal when both the relation and the context are equal.
 * {@code hashCode} is derived from the same two fields so that the class can be used
 * in hash-based collections.
 */
public class DiscourseCoreContextRelation {
    private final Relation relation;
    private final DiscourseContext context;

    /**
     * @param relation the relation
     * @param context  the context the relation points to
     */
    public DiscourseCoreContextRelation(Relation relation, DiscourseContext context) {
        this.relation = relation;
        this.context = context;
    }

    /** @return the relation passed to the constructor */
    public Relation getRelation() {
        return relation;
    }

    /** @return the context passed to the constructor */
    public DiscourseContext getContext() {
        return context;
    }

    /**
     * Null-safe field-wise comparison.
     *
     * @param o the object to compare with
     * @return {@code true} if {@code o} is a {@code DiscourseCoreContextRelation} with an
     *         equal relation and an equal context
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof DiscourseCoreContextRelation)) {
            return false;
        }
        DiscourseCoreContextRelation other = (DiscourseCoreContextRelation) o;
        // Objects.equals avoids the NullPointerException the direct field.equals(...) call
        // raised when the other instance held a null field.
        return Objects.equals(relation, other.relation)
                && Objects.equals(context, other.context);
    }

    /** @return a hash over the two fields compared by {@link #equals(Object)} */
    @Override
    public int hashCode() {
        return Objects.hash(relation, context);
    }
}
package org.lambda3.text.simplification.discourse.relation_extraction.relation;

import org.lambda3.text.simplification.discourse.relation_extraction.element.DiscourseCore;
import org.lambda3.text.simplification.discourse.tree.Relation;

import java.util.Objects;

/**
 * Immutable pair of a {@link Relation} and the {@link DiscourseCore} it points to.
 *
 * <p>Two instances are equal when both the relation and the core are equal.
 * {@code hashCode} is derived from the same two fields so that the class can be used
 * in hash-based collections.
 */
public class DiscourseCoreCoreRelation {
    private final Relation relation;
    private final DiscourseCore core;

    /**
     * @param relation the relation
     * @param core     the core the relation points to
     */
    public DiscourseCoreCoreRelation(Relation relation, DiscourseCore core) {
        this.relation = relation;
        this.core = core;
    }

    /** @return the relation passed to the constructor */
    public Relation getRelation() {
        return relation;
    }

    /** @return the core passed to the constructor */
    public DiscourseCore getCore() {
        return core;
    }

    /**
     * Null-safe field-wise comparison.
     *
     * @param o the object to compare with
     * @return {@code true} if {@code o} is a {@code DiscourseCoreCoreRelation} with an
     *         equal relation and an equal core
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof DiscourseCoreCoreRelation)) {
            return false;
        }
        DiscourseCoreCoreRelation other = (DiscourseCoreCoreRelation) o;
        // Objects.equals avoids the NullPointerException the direct field.equals(...) call
        // raised when the other instance held a null field.
        return Objects.equals(relation, other.relation)
                && Objects.equals(core, other.core);
    }

    /** @return a hash over the two fields compared by {@link #equals(Object)} */
    @Override
    public int hashCode() {
        return Objects.hash(relation, core);
    }
}
package org.lambda3.text.simplification.discourse.sentence_simplification;

import edu.stanford.nlp.trees.Tree;
import org.lambda3.text.simplification.sentence.transformation.CoreContextSentence;
import org.lambda3.text.simplification.sentence.transformation.SentenceSimplifyingException;
import org.lambda3.text.simplification.sentence.transformation.Transformer;
import org.lambda3.text.simplification.discourse.relation_extraction.element.DiscourseContext;
import org.lambda3.text.simplification.discourse.relation_extraction.element.DiscourseCore;
import org.lambda3.text.simplification.discourse.relation_extraction.relation.DiscourseCoreContextRelation;
import org.lambda3.text.simplification.discourse.relation_extraction.relation.DiscourseCoreCoreRelation;
import org.lambda3.text.simplification.discourse.sentence_simplification.classification.SContextClassifier;
import org.lambda3.text.simplification.discourse.sentence_simplification.element.DContext;
import org.lambda3.text.simplification.discourse.sentence_simplification.element.DCore;
import org.lambda3.text.simplification.discourse.sentence_simplification.element.SContext;
import org.lambda3.text.simplification.discourse.sentence_simplification.relation.DContextRelation;
import org.lambda3.text.simplification.discourse.sentence_simplification.relation.DCoreRelation;
import org.lambda3.text.simplification.discourse.tree.Relation;
import org.lambda3.text.simplification.discourse.utils.words.WordsUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Optional;

/**
 * Converts {@link DiscourseCore}/{@link DiscourseContext} elements into
 * {@link DCore}/{@link DContext} elements, running the sentence-level {@link Transformer}
 * over each element's text on the way.
 *
 * <p>Every call to {@link #simplify(List)} starts with fresh caches; within one call each
 * input element is converted once and the resulting instance is reused wherever that
 * element is referenced again.
 *
 * <p>NOTE(review): the caches are plain instance fields that are replaced on each call, so
 * sharing one instance between threads looks unsafe; confirm against callers.
 */
public class Simplifier {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    private LinkedHashMap<DiscourseCore, DCore> processedDiscourseCores;
    private LinkedHashMap<DiscourseContext, DContext> processedDiscourseContexts;

    public Simplifier() {
        this.processedDiscourseCores = new LinkedHashMap<>();
        this.processedDiscourseContexts = new LinkedHashMap<>();
    }

    /** Result of running sentence simplification on one text. */
    private static final class SimplifiedSentence {
        private final String coreText;
        private final List<SContext> sContexts;

        private SimplifiedSentence(String coreText, List<SContext> sContexts) {
            this.coreText = coreText;
            this.sContexts = sContexts;
        }
    }

    /**
     * Wraps a sentence-simplification context text in an {@link SContext}, using the relation
     * chosen by {@link SContextClassifier} or {@link Relation#UNKNOWN_SENT_SIM} if it chose none.
     */
    private static SContext createSContext(String text, int sentenceIdx) {
        Optional<Relation> relation = SContextClassifier.classify(text);
        return new SContext(text, sentenceIdx, relation.orElse(Relation.UNKNOWN_SENT_SIM));
    }

    /**
     * Runs the sentence simplifier on {@code originalText}.
     *
     * <p>The first non-null core becomes the simplified text (the code assumes there is
     * usually only one core); every non-null context becomes an {@link SContext}. If
     * simplification fails, the failure is logged and whatever was collected so far is
     * returned, which is the original text and no contexts when the simplifier itself threw.
     *
     * @param originalText the text to simplify
     * @param sentenceIdx  the sentence index to attach to the created {@link SContext}s
     * @return the simplified core text and the extracted contexts
     */
    private SimplifiedSentence simplifySentence(String originalText, int sentenceIdx) {
        String coreText = originalText;
        List<SContext> sContexts = new ArrayList<>();

        Transformer transformer = new Transformer();
        try {
            logger.debug("Simplifying: '{}'", originalText);
            CoreContextSentence simplified = transformer.simplify(originalText);

            // set coreText (assume that there is usually only one core)
            if ((simplified.getCore() != null) && (simplified.getCore().size() > 0)) {
                Tree firstCore = simplified.getCore().get(0);
                if (firstCore != null) {
                    coreText = WordsUtils.wordsToString(firstCore.yieldWords());
                }
            }

            // add (sentence simplification) contexts
            if (simplified.getContext() != null) {
                for (Tree context : simplified.getContext()) {
                    if (context != null) {
                        sContexts.add(createSContext(WordsUtils.wordsToString(context.yieldWords()), sentenceIdx));
                    }
                }
            }
        } catch (SentenceSimplifyingException e) {
            // Best effort: keep going with the unsimplified text, but leave a trace of the failure.
            logger.warn("Sentence simplification failed for '{}'; keeping the text collected so far", originalText, e);
        }

        return new SimplifiedSentence(coreText, sContexts);
    }

    /** Returns the cached {@link DContext} for {@code discourseContext}, creating it on first use. */
    private DContext getDContext(DiscourseContext discourseContext) {
        DContext cached = processedDiscourseContexts.get(discourseContext);
        if (cached != null) {
            return cached;
        }

        SimplifiedSentence simplified = simplifySentence(discourseContext.getText(), discourseContext.getSentenceIdx());
        DContext res = new DContext(simplified.coreText, discourseContext.getSentenceIdx(), discourseContext.getText());

        // add (sentence simplification) context relations
        for (SContext sContext : simplified.sContexts) {
            res.addSContext(sContext);
        }

        processedDiscourseContexts.put(discourseContext, res);
        return res;
    }

    /** Returns the cached {@link DCore} for {@code discourseCore}, creating it on first use. */
    private DCore getDCore(DiscourseCore discourseCore) {
        DCore cached = processedDiscourseCores.get(discourseCore);
        if (cached != null) {
            return cached;
        }

        SimplifiedSentence simplified = simplifySentence(discourseCore.getText(), discourseCore.getSentenceIdx());
        DCore res = new DCore(simplified.coreText, discourseCore.getSentenceIdx(), discourseCore.getText());

        // add (sentence simplification) context relations
        for (SContext sContext : simplified.sContexts) {
            res.addSContext(sContext);
        }

        processedDiscourseCores.put(discourseCore, res);
        return res;
    }

    /**
     * Converts the given discourse cores, including their core and context relations.
     *
     * <p>Context relations whose context is flagged as a sentence-simplification context are
     * attached directly as {@link SContext}s (using the context's text unchanged); all others
     * become {@link DContextRelation}s to a simplified {@link DContext}.
     *
     * @param discourseCores the cores to convert
     * @return one {@link DCore} per input core, in input order
     */
    public List<DCore> simplify(List<DiscourseCore> discourseCores) {
        this.processedDiscourseCores = new LinkedHashMap<>();
        this.processedDiscourseContexts = new LinkedHashMap<>();

        List<DCore> res = new ArrayList<>();

        for (DiscourseCore discourseCore : discourseCores) {
            DCore dCore = getDCore(discourseCore);

            // add (discourse) core relations
            for (DiscourseCoreCoreRelation coreRelation : discourseCore.getCoreRelations()) {
                dCore.addDCoreRelation(new DCoreRelation(coreRelation.getRelation(), getDCore(coreRelation.getCore())));
            }

            // add (discourse) context relations
            for (DiscourseCoreContextRelation contextRelation : discourseCore.getContextRelations()) {
                DiscourseContext context = contextRelation.getContext();

                // convert into a DContext or a SContext
                if (context.isSentSimContext()) {
                    dCore.addSContext(new SContext(context.getText(), context.getSentenceIdx(), contextRelation.getRelation()));
                } else {
                    dCore.addDContextRelation(new DContextRelation(contextRelation.getRelation(), getDContext(context)));
                }
            }

            res.add(dCore);
        }

        return res;
    }
}
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.sentence_simplification.classification; + +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import edu.stanford.nlp.trees.tregex.TregexPattern; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.utils.ner.NERStringParseException; +import org.lambda3.text.simplification.discourse.utils.ner.NERStringParser; +import org.lambda3.text.simplification.discourse.utils.ner.tner.TNERString; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeParser; +import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * + */ +public class SContextClassifier { + private static final Logger LOGGER = LoggerFactory.getLogger(SContextClassifier.class); + + private static final String PATTERN_PREFIX = "^.*(? monthPatterns = Stream.of( + "january", "jan.", + "february", "feb.", + "march", "mar.", + "april", "apr.", + "may", + "june", + "july", + "august", "aug.", + "september", "sept.", + "october", "oct.", + "november", "nov.", + "december", "dec." 
+ ).map(p -> PATTERN_PREFIX + p + PATTERN_SUFFIX).collect(Collectors.toList()); + + final List days = Stream.of( + "monday", "mon.", + "tuesday", "tues.", + "wednesday", "wed.", + "thursday", "thurs.", + "friday", "fri.", + "saturday", "sat.", + "sunday", "sun." + ).map(p -> PATTERN_PREFIX + p + PATTERN_SUFFIX).collect(Collectors.toList()); + + final String yearPattern = PATTERN_PREFIX + "[1-2]\\d\\d\\d" + PATTERN_SUFFIX; + final String bcadPattern = PATTERN_PREFIX + "(\\d+\\s+(bc|ad)|ad\\s+\\d+)" + PATTERN_SUFFIX; + final String centuryPattern = PATTERN_PREFIX + "(1st|2nd|3rd|\\d+th)\\s+century" + PATTERN_SUFFIX; + final String timePattern = PATTERN_PREFIX + "([0-1]?\\d|2[0-4])\\s*:\\s*[0-5]\\d" + PATTERN_SUFFIX; + + String text = WordsUtils.wordsToString(np.yieldWords()).toLowerCase(); + return ((monthPatterns.stream().anyMatch(text::matches)) + || (days.stream().anyMatch(text::matches)) + || (text.matches(yearPattern)) + || (text.matches(bcadPattern)) + || (text.matches(centuryPattern)) + || (text.matches(timePattern))); + } + + private static boolean isLocationNP(Tree np) { + try { + TNERString ner = NERStringParser.parse(np); + + return ner.getTokens().stream().anyMatch(t -> t.getCategory().equals("LOCATION")); + } catch (NERStringParseException e) { + return false; + } + } + + public static Optional classify(String sContext) { + + try { + Tree parseTree = ParseTreeParser.parse(sContext); + + // find TIME-relation + TregexPattern p = TregexPattern.compile("ROOT <<, (/This/ . (/(is|was)/ . (/(in|at|around)/ . NP=np)))"); + TregexMatcher matcher = p.matcher(parseTree); + + if (matcher.findAt(parseTree)) { + if (isTimeNP(matcher.getNode("np"))) { + return Optional.of(Relation.TIME); + } + } + + // find LOCATION-relation + p = TregexPattern.compile("ROOT <<, (/This/ . (/(is|was)/ . (__ . 
NP=np)))"); + matcher = p.matcher(parseTree); + + if (matcher.findAt(parseTree)) { + if (isLocationNP(matcher.getNode("np"))) { + return Optional.of(Relation.LOCATION); + } + } + + + } catch (ParseTreeException e) { + LOGGER.error("Could not generate parse tree for sContext: '" + sContext + "'"); + } + + return Optional.empty(); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/sentence_simplification/element/DContext.java b/src/main/java/org/lambda3/text/simplification/discourse/sentence_simplification/element/DContext.java new file mode 100644 index 0000000..e6d45bb --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/sentence_simplification/element/DContext.java @@ -0,0 +1,100 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : DContext + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
package org.lambda3.text.simplification.discourse.sentence_simplification.element;

import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

/**
 * A context element holding a (simplified) text, a sentence index, the text before
 * simplification, and the {@link SContext}s attached to it.
 *
 * <p>Equality is value-based over all four of these; {@link #hashCode()} is kept consistent
 * with it.
 */
public class DContext implements PrettyTreePrinter.Node {
    private final String text;
    private final int sentenceIndex;
    private final String notSimplifiedText;
    private final List<SContext> sContexts;

    /**
     * @param text              the (simplified) text
     * @param sentenceIndex     the sentence index stored alongside the text
     * @param notSimplifiedText the text before simplification
     */
    public DContext(String text, int sentenceIndex, String notSimplifiedText) {
        this.text = text;
        this.sentenceIndex = sentenceIndex;
        this.notSimplifiedText = notSimplifiedText;
        this.sContexts = new ArrayList<>();
    }

    /**
     * Appends an {@link SContext}; duplicates are not filtered.
     *
     * @param sContext the context to append
     */
    public void addSContext(SContext sContext) {
        this.sContexts.add(sContext);
    }

    /** @return the (simplified) text passed to the constructor */
    public String getText() {
        return text;
    }

    /** @return the sentence index passed to the constructor */
    public int getSentenceIndex() {
        return sentenceIndex;
    }

    /** @return the not-simplified text passed to the constructor */
    public String getNotSimplifiedText() {
        return notSimplifiedText;
    }

    /** @return the internal (live, mutable) list of attached {@link SContext}s */
    public List<SContext> getSContexts() {
        return sContexts;
    }

    /** @return a single-element list holding the text wrapped in single quotes */
    @Override
    public List<String> getPTPCaption() {
        return Collections.singletonList("'" + text + "'");
    }

    /** @return a new list with one empty-captioned edge (flag {@code true}) per attached {@link SContext} */
    @Override
    public List<PrettyTreePrinter.Edge> getPTPEdges() {
        List<PrettyTreePrinter.Edge> res = new ArrayList<>();

        res.addAll(sContexts.stream().map(
                sc -> new PrettyTreePrinter.DefaultEdge("", sc, true)
        ).collect(Collectors.toList()));

        return res;
    }

    /** @return the result of {@code PrettyTreePrinter.prettyPrint(this, false, 40)} */
    @Override
    public String toString() {
        return PrettyTreePrinter.prettyPrint(this, false, 40);
    }

    /**
     * @param o the object to compare with
     * @return {@code true} if {@code o} is a {@code DContext} with the same sentence index,
     *         text, not-simplified text and an equal list of {@link SContext}s
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof DContext)) {
            return false;
        }
        DContext dContext = (DContext) o;
        return getSentenceIndex() == dContext.getSentenceIndex() &&
                Objects.equals(getText(), dContext.getText()) &&
                Objects.equals(getNotSimplifiedText(), dContext.getNotSimplifiedText()) &&
                Objects.equals(sContexts, dContext.sContexts);
    }

    /**
     * Hashes only the immutable fields compared by {@link #equals(Object)}.
     *
     * <p>The mutable {@code sContexts} list is left out on purpose: the hash then stays
     * stable while contexts are added and does not depend on how {@link SContext} hashes.
     * Equal objects still hash equally because a subset of the compared fields is used.
     */
    @Override
    public int hashCode() {
        return Objects.hash(sentenceIndex, text, notSimplifiedText);
    }
}
package org.lambda3.text.simplification.discourse.sentence_simplification.element;

import org.lambda3.text.simplification.discourse.sentence_simplification.relation.DContextRelation;
import org.lambda3.text.simplification.discourse.sentence_simplification.relation.DCoreRelation;
import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

/**
 * A core element holding a (simplified) text, a sentence index, the text before
 * simplification, relations to other cores and to contexts, and attached {@link SContext}s.
 *
 * <p>Relations are kept duplicate-free via {@code List.contains}; {@link SContext}s are not.
 * Equality compares the three scalar fields plus the pretty-printed form of the element.
 */
public class DCore implements PrettyTreePrinter.Node {
    private final String text;
    private final int sentenceIndex;
    private final String notSimplifiedText;
    private final List<DCoreRelation> dCoreRelations;
    private final List<DContextRelation> dContextRelations;
    private final List<SContext> sContexts;

    /**
     * @param text              the (simplified) text
     * @param sentenceIndex     the sentence index stored alongside the text
     * @param notSimplifiedText the text before simplification
     */
    public DCore(String text, int sentenceIndex, String notSimplifiedText) {
        this.text = text;
        this.sentenceIndex = sentenceIndex;
        this.notSimplifiedText = notSimplifiedText;
        this.dCoreRelations = new ArrayList<>();
        this.dContextRelations = new ArrayList<>();
        this.sContexts = new ArrayList<>();
    }

    /**
     * Appends a core relation unless an equal one is already stored.
     *
     * @param dCoreRelation the relation to add
     */
    public void addDCoreRelation(DCoreRelation dCoreRelation) {
        if (!dCoreRelations.contains(dCoreRelation)) {
            dCoreRelations.add(dCoreRelation);
        }
    }

    /**
     * Appends a context relation unless an equal one is already stored.
     *
     * @param dContextRelation the relation to add
     */
    public void addDContextRelation(DContextRelation dContextRelation) {
        if (!dContextRelations.contains(dContextRelation)) {
            dContextRelations.add(dContextRelation);
        }
    }

    /**
     * Appends an {@link SContext}; duplicates are not filtered.
     *
     * @param sContext the context to append
     */
    public void addSContext(SContext sContext) {
        this.sContexts.add(sContext);
    }

    /** @return the (simplified) text passed to the constructor */
    public String getText() {
        return text;
    }

    /** @return the sentence index passed to the constructor */
    public int getSentenceIndex() {
        return sentenceIndex;
    }

    /** @return the not-simplified text passed to the constructor */
    public String getNotSimplifiedText() {
        return notSimplifiedText;
    }

    /** @return the internal (live, mutable) list of core relations */
    public List<DCoreRelation> getDCoreRelations() {
        return dCoreRelations;
    }

    /** @return the internal (live, mutable) list of context relations */
    public List<DContextRelation> getDContextRelations() {
        return dContextRelations;
    }

    /** @return the internal (live, mutable) list of attached {@link SContext}s */
    public List<SContext> getSContexts() {
        return sContexts;
    }

    /** @return a single-element list holding the text wrapped in single quotes */
    @Override
    public List<String> getPTPCaption() {
        return Collections.singletonList("'" + text + "'");
    }

    /**
     * Builds empty-captioned edges in this order: core relations (flag {@code false}),
     * context relations (flag {@code true}), then {@link SContext}s (flag {@code true}).
     *
     * @return a new list of edges
     */
    @Override
    public List<PrettyTreePrinter.Edge> getPTPEdges() {
        List<PrettyTreePrinter.Edge> res = new ArrayList<>();

        res.addAll(dCoreRelations.stream().map(
                cr -> new PrettyTreePrinter.DefaultEdge("", cr.getDCore(), false)
        ).collect(Collectors.toList()));

        res.addAll(dContextRelations.stream().map(
                cr -> new PrettyTreePrinter.DefaultEdge("", cr.getDContext(), true)
        ).collect(Collectors.toList()));

        res.addAll(sContexts.stream().map(
                sc -> new PrettyTreePrinter.DefaultEdge("", sc, true)
        ).collect(Collectors.toList()));

        return res;
    }

    /** @return the result of {@code PrettyTreePrinter.prettyPrint(this, false, 40)} */
    @Override
    public String toString() {
        return PrettyTreePrinter.prettyPrint(this, false, 40);
    }

    /**
     * @param o the object to compare with
     * @return {@code true} if {@code o} is a {@code DCore} with the same sentence index, text
     *         and not-simplified text whose {@link #toString()} output also matches
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof DCore)) {
            return false;
        }
        DCore dCore = (DCore) o;
        // The cheap scalar comparisons run first; the pretty-printed forms are only
        // built and compared when those already match.
        return getSentenceIndex() == dCore.getSentenceIndex() &&
                Objects.equals(getText(), dCore.getText()) &&
                Objects.equals(getNotSimplifiedText(), dCore.getNotSimplifiedText()) &&
                Objects.equals(toString(), dCore.toString());
    }

    /**
     * Hashes the three immutable scalar fields compared by {@link #equals(Object)}.
     *
     * <p>The pretty-printed form is left out: it changes as relations are added, and a
     * subset of the compared state is enough for equal objects to hash equally.
     */
    @Override
    public int hashCode() {
        return Objects.hash(sentenceIndex, text, notSimplifiedText);
    }

}
package org.lambda3.text.simplification.discourse.sentence_simplification.element;

import org.lambda3.text.simplification.discourse.tree.Relation;
import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

/**
 * Immutable sentence-simplification context: a text, a sentence index and the
 * {@link Relation} assigned to it.
 *
 * <p>Equality is value-based over all three fields; {@link #hashCode()} matches it.
 */
public class SContext implements PrettyTreePrinter.Node {
    private final String text;
    private final int sentenceIndex;
    private final Relation relation;

    /**
     * @param text          the context text
     * @param sentenceIndex the sentence index stored alongside the text
     * @param relation      the relation assigned to this context
     */
    public SContext(String text, int sentenceIndex, Relation relation) {
        this.text = text;
        this.sentenceIndex = sentenceIndex;
        this.relation = relation;
    }

    /** @return the text passed to the constructor */
    public String getText() {
        return text;
    }

    /** @return the sentence index passed to the constructor */
    public int getSentenceIndex() {
        return sentenceIndex;
    }

    /** @return the relation passed to the constructor */
    public Relation getRelation() {
        return relation;
    }

    /** @return a single-element list holding the text wrapped in single quotes */
    @Override
    public List<String> getPTPCaption() {
        return Collections.singletonList("'" + text + "'");
    }

    /** @return a new, empty, mutable list: an {@code SContext} has no outgoing edges */
    @Override
    public List<PrettyTreePrinter.Edge> getPTPEdges() {
        return new ArrayList<>();
    }

    /**
     * @param o the object to compare with
     * @return {@code true} if {@code o} is an {@code SContext} with the same sentence index,
     *         text and relation
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof SContext)) {
            return false;
        }
        SContext sContext = (SContext) o;
        return getSentenceIndex() == sContext.getSentenceIndex() &&
                Objects.equals(getText(), sContext.getText()) &&
                getRelation() == sContext.getRelation();
    }

    /** @return a hash over the three fields compared by {@link #equals(Object)} */
    @Override
    public int hashCode() {
        return Objects.hash(sentenceIndex, text, relation);
    }
}
b/src/main/java/org/lambda3/text/simplification/discourse/sentence_simplification/relation/DContextRelation.java new file mode 100644 index 0000000..b418fde --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/sentence_simplification/relation/DContextRelation.java @@ -0,0 +1,58 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : DContextRelation + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.sentence_simplification.relation; + +import org.lambda3.text.simplification.discourse.sentence_simplification.element.DContext; +import org.lambda3.text.simplification.discourse.tree.Relation; + +import java.util.Objects; + +/** + * + */ +public class DContextRelation { + private final Relation relation; + private final DContext dContext; + + public DContextRelation(Relation relation, DContext dContext) { + this.relation = relation; + this.dContext = dContext; + } + + public Relation getRelation() { + return relation; + } + + public DContext getDContext() { + return dContext; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof DContextRelation)) return false; + DContextRelation that = (DContextRelation) o; + return getRelation() == that.getRelation() && + Objects.equals(dContext, that.dContext); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/sentence_simplification/relation/DCoreRelation.java b/src/main/java/org/lambda3/text/simplification/discourse/sentence_simplification/relation/DCoreRelation.java new file mode 100644 index 0000000..3911bc1 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/sentence_simplification/relation/DCoreRelation.java @@ -0,0 +1,59 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : DCoreRelation + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.sentence_simplification.relation; + +import org.lambda3.text.simplification.discourse.sentence_simplification.element.DCore; +import org.lambda3.text.simplification.discourse.tree.Relation; + +import java.util.Objects; + +/** + * + */ +public class DCoreRelation { + private final Relation relation; + private final DCore dCore; + + public DCoreRelation(Relation relation, DCore dCore) { + this.relation = relation; + this.dCore = dCore; + } + + public Relation getRelation() { + return relation; + } + + public DCore getDCore() { + return dCore; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof DCoreRelation)) return false; + DCoreRelation that = (DCoreRelation) o; + return getRelation() == that.getRelation() && + Objects.equals(dCore, that.dCore); + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/DiscourseTreeCreator.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/DiscourseTreeCreator.java new file mode 100644 index 0000000..db65e94 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/DiscourseTreeCreator.java @@ -0,0 +1,233 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : DiscourseTreeCreator + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms 
of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree; + +import edu.stanford.nlp.trees.Tree; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.extraction.ExtractionRule; +import org.lambda3.text.simplification.discourse.tree.extraction.model.CoordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.extraction.model.RefCoordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.extraction.model.RefSubordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.extraction.model.SubordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.extraction.rules.*; +import org.lambda3.text.simplification.discourse.tree.model.*; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeParser; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeVisualizer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +/** + * + */ +public class DiscourseTreeCreator { + private static final List rules; + + static { + rules = new ArrayList<>(); + + rules.add(new 
ReferenceExtractorForContainingWords()); + rules.add(new ReferenceExtractorForPrecedingWords()); + rules.add(new CoordinationExtractor()); + rules.add(new SharedNPCoordinationExtractor()); + rules.add(new SubordinationExtractor()); + rules.add(new IntraSententialSubordinationExtraction()); + rules.add(new RightSubordinateEnablementExtractor()); + rules.add(new LeftSubordinateEnablementExtractor()); + + rules.add(new ListNPExtractor("ROOT <<: (S < (NP=np < (NP $.. NP) $.. VP))")); + rules.add(new ListNPExtractor("ROOT <<: (S < (NP $.. (VP << (NP=np < (NP $.. NP)))))")); + } + + private final Logger logger = LoggerFactory.getLogger(getClass()); + private Coordination discourseTree; + + public DiscourseTreeCreator() { + reset(); + } + + public void reset() { + this.discourseTree = new Coordination( + "ROOT", + Relation.UNKNOWN_COORDINATION, + null, + new ArrayList<>() + + ); + } + + public void addSentence(String sentence, int sentenceIdx) { + discourseTree.addCoordination(new SentenceLeaf(sentence, sentenceIdx)); + } + + public DiscourseTree getLastSentenceTree() { + DiscourseTree res = null; + if (discourseTree.getCoordinations().size() > 0) { + res = discourseTree.getCoordinations().get(discourseTree.getCoordinations().size() - 1); + } + + return res; + } + + public Coordination getDiscourseTree() { + return discourseTree; + } + + public void update() { + processDiscourseTreeRec(discourseTree); + discourseTree.cleanup(); + } + + private void processDiscourseTreeRec(DiscourseTree discourseTree) { + + if (discourseTree instanceof Coordination) { + Coordination coordination = (Coordination) discourseTree; + + for (DiscourseTree child : coordination.getCoordinations()) { + + // process coordination-leaf if not processed yet + if (child.isNotProcessed()) { + DiscourseTree c = child; + + if (child instanceof Leaf) { + Optional newChild = applyRules((Leaf) child); + if (newChild.isPresent()) { + coordination.replaceCoordination(child, newChild.get()); + c = 
newChild.get(); + } + } + + child.setProcessed(); + + // recursion + processDiscourseTreeRec(c); + } + } + } + + if (discourseTree instanceof Subordination) { + Subordination subordination = (Subordination) discourseTree; + + // process superordination-leaf if not processed yet + if (subordination.getSuperordination().isNotProcessed()) { + + if (subordination.getSuperordination() instanceof Leaf) { + Optional newChild = applyRules((Leaf) subordination.getSuperordination()); + newChild.ifPresent(subordination::replaceSuperordination); + } + + subordination.getSuperordination().setProcessed(); + + // recursion + processDiscourseTreeRec(subordination.getSuperordination()); + } + + // process subordination-leaf if not processed yet + if (subordination.getSubordination().isNotProcessed()) { + + if (subordination.getSubordination() instanceof Leaf) { + Optional newChild = applyRules((Leaf) subordination.getSubordination()); + newChild.ifPresent(subordination::replaceSubordination); + } + + subordination.getSubordination().setProcessed(); + + // recursion + processDiscourseTreeRec(subordination.getSubordination()); + } + } + } + + private Optional applyRules(Leaf leaf) { + logger.debug("Processing leaf:"); + if (logger.isDebugEnabled()) { + logger.debug(leaf.toString()); + } + + if ((leaf.getType().equals(Leaf.Type.TERMINAL)) || (leaf.getType().equals(Leaf.Type.SENT_SIM_CONTEXT))) { + logger.debug("Leaf will not be split."); + return Optional.empty(); + } + + // try to generate parseTree + Tree parseTree; + try { + parseTree = ParseTreeParser.parse(leaf.getText()); + } catch (ParseTreeException e) { + logger.error("Failed to generate parse tree"); + + return Optional.empty(); + } + logger.debug("Parse tree:"); + if (logger.isDebugEnabled()) { + logger.debug(ParseTreeVisualizer.prettyPrint(parseTree)); + } + + // check rules + for (ExtractionRule rule : rules) { + + Optional extraction = rule.extract(parseTree); + if (extraction.isPresent()) { + logger.debug("Extraction 
rule " + rule.getClass().getSimpleName() + " matched."); + + // handle CoordinationExtraction + if (extraction.get() instanceof CoordinationExtraction) { + return Optional.of(((CoordinationExtraction) extraction.get()).convert()); + } + + // handle SubordinationExtraction + if (extraction.get() instanceof SubordinationExtraction) { + return Optional.of(((SubordinationExtraction) extraction.get()).convert()); + } + + // handle RefCoordinationExtraction + if (extraction.get() instanceof RefCoordinationExtraction) { + Optional r = ((RefCoordinationExtraction) extraction.get()).convert(leaf); + if (r.isPresent()) { + return r; + } else { + logger.debug("Reference could not be used, checking other model rules."); + } + } + + // handle RefSubordinationExtraction + if (extraction.get() instanceof RefSubordinationExtraction) { + Optional r = ((RefSubordinationExtraction) extraction.get()).convert(leaf); + if (r.isPresent()) { + return r; + } else { + logger.debug("Reference could not be used, checking other model rules."); + } + } + } + } + logger.debug("No model rule applied."); + + return Optional.empty(); + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/Relation.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/Relation.java new file mode 100644 index 0000000..4676e40 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/Relation.java @@ -0,0 +1,75 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : Relation + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree; + +import java.util.Optional; + +public enum Relation { + + // default relations + UNKNOWN_COORDINATION, // the default for coordination + UNKNOWN_SUBORDINATION, // the default for subordination + UNKNOWN_SENT_SIM, // the default for sentence simplification + + BACKGROUND, + CAUSE, + CONDITION, + CONTRAST, + ELABORATION, + ENABLEMENT, + EXPLANATION, + JOINT_LIST, + JOINT_DISJUNCTION, + TEMPORAL_BEFORE, + TEMPORAL_AFTER, + TEMPORAL_SEQUENCE, + + // special relations + INTRA_SENTENTIAL_ATTRIBUTION, + JOINT_NP_LIST, + JOINT_NP_DISJUNCTION, + + // sentence simplification + TIME, + LOCATION; + + static { + TEMPORAL_AFTER.reverseRelation = TEMPORAL_BEFORE; + TEMPORAL_BEFORE.reverseRelation = TEMPORAL_AFTER; + } + + private Relation reverseRelation; + + Relation() { + /* + * by default, each relation is bidirectional with an equal reverse relation. + * To make a relation unidirectional, set reverseRelation to null. 
+ */ + this.reverseRelation = this; + } + + + public Optional getReverseRelation() { + return Optional.ofNullable(reverseRelation); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/classification/SignalPhraseClassifier.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/classification/SignalPhraseClassifier.java new file mode 100644 index 0000000..41afd82 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/classification/SignalPhraseClassifier.java @@ -0,0 +1,167 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : SignalPhraseClassifier + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.classification; + +import edu.stanford.nlp.ling.Word; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; + +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +/** + * + */ +public class SignalPhraseClassifier { + + private static final List GENERAL_MAPPINGS = Arrays.asList( + + // BACKGROUND + new Mapping(Relation.BACKGROUND, "...as..."), + new Mapping(Relation.BACKGROUND, "...now..."), + new Mapping(Relation.BACKGROUND, "...once..."), +// new Mapping(Relation.BACKGROUND, "previously"), + new Mapping(Relation.BACKGROUND, "...when..."), + new Mapping(Relation.BACKGROUND, "...with..."), + new Mapping(Relation.BACKGROUND, "...without..."), + + // CAUSE + new Mapping(Relation.CAUSE, "...largely because..."), + new Mapping(Relation.CAUSE, "...as a result..."), + new Mapping(Relation.CAUSE, "...as a result of..."), + new Mapping(Relation.CAUSE, "...because..."), + new Mapping(Relation.CAUSE, "...since..."), + + // CONDITION + new Mapping(Relation.CONDITION, "...if..."), + new Mapping(Relation.CONDITION, "...in case..."), + new Mapping(Relation.CONDITION, "...unless..."), + new Mapping(Relation.CONDITION, "...until..."), + + // CONTRAST + new Mapping(Relation.CONTRAST, "...although..."), + new Mapping(Relation.CONTRAST, "...but..."), + new Mapping(Relation.CONTRAST, "...but now..."), + new Mapping(Relation.CONTRAST, "...despite..."), + new Mapping(Relation.CONTRAST, "...even though..."), + new Mapping(Relation.CONTRAST, "...even when..."), + new Mapping(Relation.CONTRAST, "...however..."), + new Mapping(Relation.CONTRAST, "...instead..."), + new Mapping(Relation.CONTRAST, "...rather..."), + new Mapping(Relation.CONTRAST, "...still..."), + new Mapping(Relation.CONTRAST, "...though..."), + new Mapping(Relation.CONTRAST, 
"...thus..."), + new Mapping(Relation.CONTRAST, "...until recently..."), + new Mapping(Relation.CONTRAST, "...while..."), + new Mapping(Relation.CONTRAST, "...yet..."), + + // ELABORATION + new Mapping(Relation.ELABORATION, "...more provocatively..."), + new Mapping(Relation.ELABORATION, "...even before..."), + new Mapping(Relation.ELABORATION, "...for example..."), + new Mapping(Relation.ELABORATION, "...further..."), + new Mapping(Relation.ELABORATION, "...recently..."), + new Mapping(Relation.ELABORATION, "...since...now..."), + new Mapping(Relation.ELABORATION, "...so..."), + new Mapping(Relation.ELABORATION, "...so far..."), + new Mapping(Relation.ELABORATION, "...where..."), + new Mapping(Relation.ELABORATION, "...whereby..."), + new Mapping(Relation.ELABORATION, "...whether..."), + + // EXPLANATION + new Mapping(Relation.EXPLANATION, "...simply because..."), + new Mapping(Relation.EXPLANATION, "...because of..."), + new Mapping(Relation.EXPLANATION, "...indeed..."), + new Mapping(Relation.EXPLANATION, "...so...that..."), + + // JOINT_LIST + new Mapping(Relation.JOINT_LIST, "...and..."), + + // JOINT_DISJUNCTION + new Mapping(Relation.JOINT_DISJUNCTION, "...or..."), + + // TEMPORAL_BEFORE + new Mapping(Relation.TEMPORAL_BEFORE, "...before..."), + new Mapping(Relation.TEMPORAL_BEFORE, "...previously..."), // changed from BACKGROUND TO TEMPORAL_BEFORE + + // TEMPORAL_AFTER + new Mapping(Relation.TEMPORAL_AFTER, "...after..."), + new Mapping(Relation.TEMPORAL_AFTER, "...and after..."), + new Mapping(Relation.TEMPORAL_AFTER, "...next..."), + new Mapping(Relation.TEMPORAL_AFTER, "...then..."), + + // TEMPORAL_SEQUENCE + new Mapping(Relation.TEMPORAL_SEQUENCE, "...thereafter...") + ); + + private static Optional classify(List mappings, List signalPhraseWords) { + String signalPhrase = WordsUtils.wordsToString(signalPhraseWords); + + Optional bestMapping = Optional.empty(); + for (Mapping mapping : mappings) { + if (mapping.check(signalPhrase)) { + if 
(!bestMapping.isPresent()) { + bestMapping = Optional.of(mapping); + } else if (mapping.getSignalPhrasePatternSize() >= bestMapping.get().getSignalPhrasePatternSize()) { + bestMapping = Optional.of(mapping); + } + } + } + + return bestMapping.map(Mapping::getRelation); + } + + public static Optional classifyCustom(List mappings, List signalPhraseWords) { + return classify(mappings, signalPhraseWords); + } + + public static Optional classifyGeneral(List signalPhraseWords) { + return classify(GENERAL_MAPPINGS, signalPhraseWords); + } + + public static class Mapping { + private final Relation relation; + + private final String signalPhrasePattern; // optional + private final int signalPhrasePatternSize; + + public Mapping(Relation relation, String signalPhrasePattern) { + this.relation = relation; + this.signalPhrasePattern = "^" + signalPhrasePattern.replaceAll("\\.\\.\\.", "((?<=^)(.*\\\\W)?|\\\\W|\\\\W.*\\\\W|(\\\\W.*)?(?=\\$))") + "$"; + this.signalPhrasePatternSize = signalPhrasePattern.length(); + } + + boolean check(String signalPhrase) { + return signalPhrase.toLowerCase().matches(signalPhrasePattern); + } + + public Relation getRelation() { + return relation; + } + + int getSignalPhrasePatternSize() { + return signalPhrasePatternSize; + } + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/Extraction.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/Extraction.java new file mode 100644 index 0000000..201dfb5 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/Extraction.java @@ -0,0 +1,30 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : Extraction + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, 
either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction; + +/** + * + */ +public abstract class Extraction { + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/ExtractionRule.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/ExtractionRule.java new file mode 100644 index 0000000..51653f7 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/ExtractionRule.java @@ -0,0 +1,153 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : ExtractionRule + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import edu.stanford.nlp.trees.tregex.TregexPattern; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeParser; +import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +public abstract class ExtractionRule { + protected final Logger logger = LoggerFactory.getLogger(getClass()); + protected final TregexPattern pattern; + + public ExtractionRule(String pattern) { + this.pattern = TregexPattern.compile(pattern); + } + + protected static boolean isInfinitival(Tree clauseParseTree) { + TregexPattern p = TregexPattern.compile(clauseParseTree.value() + " <<, VP <<, /(T|t)o/"); + TregexMatcher matcher = p.matcher(clauseParseTree); + + return (matcher.findAt(clauseParseTree)); + } + + protected static List getSiblings(Tree parseTree, List tags) { + return parseTree.getChildrenAsList().stream().filter(c -> tags.contains(c.value())).collect(Collectors.toList()); + } + + private static Tense getTense(Tree vp) { + Tense res = Tense.PRESENT; + + // find past tense + TregexPattern p = TregexPattern.compile("VBD|VBN"); + TregexMatcher matcher = p.matcher(vp); + + if (matcher.find()) { + res = Tense.PAST; + } + + return res; + } + + private static List appendWordsFromTree(List words, Tree tree) { + List res = new ArrayList<>(); + res.addAll(words); + + TregexPattern p = TregexPattern.compile(tree.value() + " <<, NNP|NNPS"); + TregexMatcher matcher = p.matcher(tree); + + boolean isFirst = true; + for (Word 
word : tree.yieldWords()) { + if ((isFirst) && (!matcher.findAt(tree))) { + res.add(WordsUtils.lowercaseWord(word)); + } else { + res.add(word); + } + isFirst = false; + } + + return res; + } + + // pp is optional + protected static List rephraseIntraSententialAttribution(List words) { + try { + List res = new ArrayList<>(); + + Tree parseTree = ParseTreeParser.parse(WordsUtils.wordsToProperSentenceString(words)); + + TregexPattern p = TregexPattern.compile("ROOT <: (S < (NP=np ?$,, PP=pp $.. VP=vp))"); + TregexMatcher matcher = p.matcher(parseTree); + if (matcher.findAt(parseTree)) { + Tree pp = matcher.getNode("pp"); // optional + Tree np = matcher.getNode("np"); + Tree vp = matcher.getNode("vp"); + + Tense tense = getTense(vp); + if (tense.equals(Tense.PRESENT)) { + res.add(new Word("This")); + res.add(new Word("is")); + res.add(new Word("what")); + } else { + res.add(new Word("This")); + res.add(new Word("was")); + res.add(new Word("what")); + } + res = appendWordsFromTree(res, np); + res = appendWordsFromTree(res, vp); + if (pp != null) { + res = appendWordsFromTree(res, pp); + } + } + + return res; + } catch (ParseTreeException e) { + return words; + } + } + + protected static List rephraseEnablement(Tree s, Tree vp) { + List res = new ArrayList<>(); + + Tense tense = getTense(vp); + if (tense.equals(Tense.PRESENT)) { + res.add(new Word("This")); + res.add(new Word("is")); + } else { + res.add(new Word("This")); + res.add(new Word("was")); + } + res = appendWordsFromTree(res, s); + + return res; + } + + public abstract Optional extract(Tree parseTree); + + protected enum Tense { + PRESENT, + PAST + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/CoordinationExtraction.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/CoordinationExtraction.java new file mode 100644 index 0000000..1d52a60 --- /dev/null +++ 
b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/CoordinationExtraction.java @@ -0,0 +1,75 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : CoordinationExtraction + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.model; + +import edu.stanford.nlp.ling.Word; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.model.Coordination; +import org.lambda3.text.simplification.discourse.tree.model.DiscourseTree; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; + +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * + */ +public class CoordinationExtraction extends Extraction { + private final String extractionRule; + private final Relation relation; + private final String signalPhrase; // optional + private final List coordinations; + private final Leaf.Type coordinationsType; + + public 
CoordinationExtraction(String extractionRule, Relation relation, List> coordinationsWords, Leaf.Type coordinationsType) { + this.extractionRule = extractionRule; + this.relation = relation; + this.signalPhrase = null; + this.coordinations = coordinationsWords.stream().map(WordsUtils::wordsToProperSentenceString).collect(Collectors.toList()); + this.coordinationsType = coordinationsType; + } + + // binary + public CoordinationExtraction(String extractionRule, Relation relation, List signalPhraseWords, List leftCoordinationWords, List rightCoordinationWords, Leaf.Type coordinationsType) { + this.extractionRule = extractionRule; + this.relation = relation; + this.signalPhrase = (signalPhraseWords != null) ? WordsUtils.wordsToString(signalPhraseWords) : null; + this.coordinations = Stream.of(leftCoordinationWords, rightCoordinationWords).map(WordsUtils::wordsToProperSentenceString).collect(Collectors.toList()); + this.coordinationsType = coordinationsType; + } + + public DiscourseTree convert() { + return new Coordination( + extractionRule, + relation, + signalPhrase, + coordinations.stream().map( + s -> new Leaf(coordinationsType, extractionRule, s) + ).collect(Collectors.toList()) + ); + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/RefCoordinationExtraction.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/RefCoordinationExtraction.java new file mode 100644 index 0000000..a228934 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/RefCoordinationExtraction.java @@ -0,0 +1,81 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : RefCoordinationExtraction + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free 
Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.model; + +import edu.stanford.nlp.ling.Word; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.model.Coordination; +import org.lambda3.text.simplification.discourse.tree.model.DiscourseTree; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; + +import java.util.Collections; +import java.util.List; +import java.util.Optional; + +/** + * Extraction result for a coordination whose first coordination is not built from words but taken from + * the node returned by getPreviousNode() of the current leaf. Only the right coordination is stored + * as text; convert(Leaf) combines both into a Coordination. + */ +public class RefCoordinationExtraction extends Extraction { + private final String extractionRule; + private final Relation relation; + private final String signalPhrase; // optional + private final String rightCoordination; + private final Leaf.Type rightCoordinationType; + + // binary + /** + * Stores the rule name, the relation and the rendered texts. The signal phrase is rendered with + * WordsUtils.wordsToString and stays null when no words are given; the right coordination is rendered + * with WordsUtils.wordsToProperSentenceString. + * + * @param extractionRule name of the rule that produced this extraction + * @param relation relation handed to the resulting Coordination + * @param signalPhraseWords words of the signal phrase, or null when there is none + * @param rightCoordinationWords words of the right coordination + * @param rightCoordinationType leaf type used for the right coordination leaf + */ + public RefCoordinationExtraction(String extractionRule, Relation relation, List signalPhraseWords, List rightCoordinationWords, Leaf.Type rightCoordinationType) { + this.extractionRule = extractionRule; + this.relation = relation; + this.signalPhrase = (signalPhraseWords != null) ?
WordsUtils.wordsToString(signalPhraseWords) : null; + this.rightCoordination = WordsUtils.wordsToProperSentenceString(rightCoordinationWords); + this.rightCoordinationType = rightCoordinationType; + } + + /** + * Builds a Coordination that uses the node preceding currChild as its first coordination and a new + * leaf holding the right coordination text as its second one. The previous node is marked through + * useAsReference() before it is attached. + * NOTE(review): the Coordination is created with Collections.emptyList(), which is immutable, so the + * two addCoordination calls only work if Coordination copies the list it is given - confirm there. + * + * @param currChild leaf whose previous node is looked up + * @return the new Coordination, or an empty Optional when there is no previous node or the previous + * node reports that it is not usable as a reference + */ + public Optional convert(Leaf currChild) { + + // find previous node to use as a reference + Optional prevNode = currChild.getPreviousNode(); + if ((prevNode.isPresent()) && (prevNode.get().usableAsReference())) { + + // use prev node as a reference + prevNode.get().useAsReference(); + + Coordination res = new Coordination( + extractionRule, + relation, + signalPhrase, + Collections.emptyList() + ); + res.addCoordination(prevNode.get()); // set prev node as a reference + res.addCoordination(new Leaf(rightCoordinationType, extractionRule, rightCoordination)); + + return Optional.of(res); + + } + + return Optional.empty(); + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/RefSubordinationExtraction.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/RefSubordinationExtraction.java new file mode 100644 index 0000000..9509bf6 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/RefSubordinationExtraction.java @@ -0,0 +1,81 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : RefSubordinationExtraction + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.model; + +import edu.stanford.nlp.ling.Word; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.model.DiscourseTree; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.tree.model.Subordination; +import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; + +import java.util.List; +import java.util.Optional; + +/** + * Extraction result for a subordination whose left constituent is not built from words but taken from + * the node returned by getPreviousNode() of the current leaf. Only the right constituent is stored as + * text; convert(Leaf) combines both into a Subordination. + */ +public class RefSubordinationExtraction extends Extraction { + private final String extractionRule; + private final Relation relation; + private final String signalPhrase; // optional + private final String rightConstituent; + private final boolean superordinationIsLeft; + private final Leaf.Type rightConstituentType; + + // binary + /** + * Stores the rule name, the relation and the rendered texts. The signal phrase is rendered with + * WordsUtils.wordsToString and stays null when no words are given; the right constituent is rendered + * with WordsUtils.wordsToProperSentenceString. + * + * @param extractionRule name of the rule that produced this extraction + * @param relation relation handed to the resulting Subordination + * @param signalPhraseWords words of the signal phrase, or null when there is none + * @param rightConstituentWords words of the right constituent + * @param superordinationIsLeft flag handed unchanged to the Subordination constructor + * @param rightConstituentType leaf type used for the right constituent leaf + */ + public RefSubordinationExtraction(String extractionRule, Relation relation, List signalPhraseWords, List rightConstituentWords, boolean superordinationIsLeft, Leaf.Type rightConstituentType) { + this.extractionRule = extractionRule; + this.relation = relation; + this.signalPhrase = (signalPhraseWords != null) ?
WordsUtils.wordsToString(signalPhraseWords) : null; + this.rightConstituent = WordsUtils.wordsToProperSentenceString(rightConstituentWords); + this.superordinationIsLeft = superordinationIsLeft; + this.rightConstituentType = rightConstituentType; + } + + /** + * Builds a Subordination whose left constituent is the node preceding currChild. The Subordination is + * first created with a placeholder leaf (type DEFAULT, text tmp) on the left, which is then swapped for + * the previous node through replaceLeftConstituent. The previous node is marked through + * useAsReference() before the Subordination is created. + * + * @param currChild leaf whose previous node is looked up + * @return the new Subordination, or an empty Optional when there is no previous node or the previous + * node reports that it is not usable as a reference + */ + public Optional convert(Leaf currChild) { + + // find previous node to use as a reference + Optional prevNode = currChild.getPreviousNode(); + if ((prevNode.isPresent()) && (prevNode.get().usableAsReference())) { + + // use prev node as a reference + prevNode.get().useAsReference(); + + Subordination res = new Subordination( + extractionRule, + relation, + signalPhrase, + new Leaf(Leaf.Type.DEFAULT, extractionRule, "tmp"), + new Leaf(rightConstituentType, extractionRule, rightConstituent), + superordinationIsLeft + ); + res.replaceLeftConstituent(prevNode.get()); // set prev node as a reference + + return Optional.of(res); + } + + return Optional.empty(); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/SubordinationExtraction.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/SubordinationExtraction.java new file mode 100644 index 0000000..72cbe8d --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/model/SubordinationExtraction.java @@ -0,0 +1,70 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : SubordinationExtraction + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.model; + +import edu.stanford.nlp.ling.Word; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.model.DiscourseTree; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.tree.model.Subordination; +import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; + +import java.util.List; + +/** + * Extraction result for a binary subordination: a left and a right constituent rendered as sentence + * strings, an optional signal phrase, the relation between the constituents, the leaf type of each side + * and the superordinationIsLeft flag that is handed on to the Subordination node. + */ +public class SubordinationExtraction extends Extraction { + private final String extractionRule; + private final Relation relation; + private final String signalPhrase; // optional + private final String leftConstituent; + private final String rightConstituent; + private final boolean superordinationIsLeft; + private final Leaf.Type leftConstituentType; + private final Leaf.Type rightConstituentType; + + // binary + /** + * Stores the rule name, the relation and the rendered texts. The signal phrase is rendered with + * WordsUtils.wordsToString and stays null when no words are given; both constituents are rendered + * with WordsUtils.wordsToProperSentenceString. + * + * @param extractionRule name of the rule that produced this extraction + * @param relation relation handed to the resulting Subordination + * @param signalPhraseWords words of the signal phrase, or null when there is none + * @param leftConstituentWords words of the left constituent + * @param rightConstituentWords words of the right constituent + * @param superordinationIsLeft flag handed unchanged to the Subordination constructor + * @param leftConstituentType leaf type used for the left constituent leaf + * @param rightConstituentType leaf type used for the right constituent leaf + */ + public SubordinationExtraction(String extractionRule, Relation relation, List signalPhraseWords, List leftConstituentWords, List rightConstituentWords, boolean superordinationIsLeft, Leaf.Type leftConstituentType, Leaf.Type rightConstituentType) { + this.extractionRule = extractionRule; + this.relation = relation; + this.signalPhrase = (signalPhraseWords != null) ?
WordsUtils.wordsToString(signalPhraseWords) : null; + this.leftConstituent = WordsUtils.wordsToProperSentenceString(leftConstituentWords); + this.rightConstituent = WordsUtils.wordsToProperSentenceString(rightConstituentWords); + this.superordinationIsLeft = superordinationIsLeft; + this.leftConstituentType = leftConstituentType; + this.rightConstituentType = rightConstituentType; + } + + /** + * Creates the Subordination node for this extraction, wrapping each constituent text in a new Leaf + * of the configured type. + * + * @return a new Subordination built from the stored fields + */ + public DiscourseTree convert() { + return new Subordination( + extractionRule, + relation, + signalPhrase, + new Leaf(leftConstituentType, extractionRule, leftConstituent), + new Leaf(rightConstituentType, extractionRule, rightConstituent), + superordinationIsLeft + ); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/CoordinationExtractor.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/CoordinationExtractor.java new file mode 100644 index 0000000..c3ba166 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/CoordinationExtractor.java @@ -0,0 +1,110 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : CoordinationExtractor + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/.
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.rules; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.classification.SignalPhraseClassifier; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.extraction.ExtractionRule; +import org.lambda3.text.simplification.discourse.tree.extraction.model.CoordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Optional; + +/** + * Rule that splits coordinated sibling constituents of a sentence into separate constituents. + * + * The public constructor uses a pattern that binds a sentence node as s and passes s and S as node1 and + * node2; the package-private constructor lets subclasses supply another pattern together with the name of + * the bound node (node1) and the value handed to getSiblings (node2). + * + * extract(Tree) matches the pattern at the given tree, obtains the siblings from the getSiblings helper + * (not shown in this file), and builds one word list per sibling in combineSiblings: the words that precede + * the first sibling, followed by the words of that sibling, followed by the words that follow the last + * sibling. With exactly two word lists the words between the two siblings become the signal phrase and are + * classified with SignalPhraseClassifier.classifyGeneral, falling back to UNKNOWN_COORDINATION; with any + * other number the n-ary CoordinationExtraction constructor is used with UNKNOWN_COORDINATION. An empty + * Optional is returned when the pattern does not match. + * NOTE(review): siblings.get(0) assumes getSiblings never returns an empty list for a matching tree - + * confirm in ExtractionRule. + */ +public class CoordinationExtractor extends ExtractionRule { + + private final String node1; + private final String node2; + + public CoordinationExtractor() { + this("ROOT <<: (S < (S $..
S))", "s", "S"); + } + + CoordinationExtractor(String pattern, String node1, String node2) { + super(pattern); + this.node1 = node1; + this.node2 = node2; + } + + private List> combineSiblings(List precedingWords, List followingWords, List siblings) { + List> constituentsWords = new ArrayList<>(); + for (Tree sibling : siblings) { + List constituentWords = new ArrayList<>(); + + constituentWords.addAll(precedingWords); + constituentWords.addAll(ParseTreeExtractionUtils.getContainingWords(sibling)); + constituentWords.addAll(followingWords); + + constituentsWords.add(constituentWords); + } + + return constituentsWords; + } + + @Override + public Optional extract(Tree parseTree) { + + TregexMatcher matcher = pattern.matcher(parseTree); + + if (matcher.findAt(parseTree)) { + List siblings = getSiblings(matcher.getNode(node1), Collections.singletonList(node2)); + + // constituents + List precedingWords = ParseTreeExtractionUtils.getPrecedingWords(parseTree, siblings.get(0), false); + List followingWords = ParseTreeExtractionUtils.getFollowingWords(parseTree, siblings.get(siblings.size() - 1), false); + List> constituentsWords = combineSiblings(precedingWords, followingWords, siblings); + + // result + if (constituentsWords.size() == 2) { + List signalPhraseWords = ParseTreeExtractionUtils.getWordsInBetween(parseTree, siblings.get(0), siblings.get(siblings.size() - 1), false, false); + Optional relation = SignalPhraseClassifier.classifyGeneral(signalPhraseWords); + + return Optional.of(new CoordinationExtraction( + getClass().getSimpleName(), + relation.orElse(Relation.UNKNOWN_COORDINATION), + signalPhraseWords, + constituentsWords.get(0), + constituentsWords.get(constituentsWords.size() - 1), + Leaf.Type.DEFAULT) + ); + } else { + return Optional.of(new CoordinationExtraction( + getClass().getSimpleName(), + Relation.UNKNOWN_COORDINATION, + constituentsWords, + Leaf.Type.DEFAULT) + ); + } + } + + return Optional.empty(); + } + +} diff --git
a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/IntraSententialSubordinationExtraction.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/IntraSententialSubordinationExtraction.java new file mode 100644 index 0000000..4c9f468 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/IntraSententialSubordinationExtraction.java @@ -0,0 +1,114 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : IntraSententialSubordinationExtraction + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.rules; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.classification.SignalPhraseClassifier; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.extraction.model.SubordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +/** + * Rule for a subordinate clause inside the verb phrase of a sentence. The pattern binds the verb phrase + * as vp, the SBAR below it as sbar and the clause inside the SBAR as s. + * + * extract(Tree) takes the words of sbar that precede s as signal phrase, the words of the whole tree + * before and after sbar as left constituent and the words of s as right constituent. The relation is + * chosen in this order: + * 1. a match of the signal phrase against the custom attribution mappings; in that case the left + * constituent is rephrased with rephraseIntraSententialAttribution, typed SENT_SIM_CONTEXT, and + * superordinationIsLeft becomes false; + * 2. ENABLEMENT when isInfinitival(s) holds; in that case the right constituent is replaced by the result + * of rephraseEnablement(s, vp) and typed SENT_SIM_CONTEXT; + * 3. the result of SignalPhraseClassifier.classifyGeneral on the signal phrase; + * 4. UNKNOWN_SUBORDINATION when none of the above yields a relation. + * An empty Optional is returned only when the pattern does not match. The rephrase and isInfinitival + * helpers are not defined in this class; presumably they are inherited - confirm in SubordinationExtractor. + */ +public class IntraSententialSubordinationExtraction extends SubordinationExtractor { + // custom mappings + private static final List INTRA_SENTENTIAL_ATTRIBUTION_MAPPINGS = Arrays.asList( + + // INTRA_SENTENTIAL_ATTRIBUTION + new SignalPhraseClassifier.Mapping(Relation.INTRA_SENTENTIAL_ATTRIBUTION, ""), + new SignalPhraseClassifier.Mapping(Relation.INTRA_SENTENTIAL_ATTRIBUTION, "...that...") + ); + + + public IntraSententialSubordinationExtraction() { + super("ROOT <<: (S < (NP $..
(VP=vp <+(VP) (SBAR=sbar < (S=s)))))"); + } + + @Override + public Optional extract(Tree parseTree) { + + TregexMatcher matcher = pattern.matcher(parseTree); + + if (matcher.findAt(parseTree)) { + List signalPhraseWords = ParseTreeExtractionUtils.getPrecedingWords(matcher.getNode("sbar"), matcher.getNode("s"), false); + + // the left, (usually) superordinate constituent + List leftConstituentWords = new ArrayList<>(); + leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(parseTree, matcher.getNode("sbar"), false)); + leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(parseTree, matcher.getNode("sbar"), false)); + + // the right, (usually) subordinate constituent + List rightConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s")); + + // result + Optional relation; + boolean superordinationIsLeft = true; + Leaf.Type leftConstituentType = Leaf.Type.DEFAULT; + Leaf.Type rightConstituentType = Leaf.Type.DEFAULT; + + // intra sentential attribution + relation = SignalPhraseClassifier.classifyCustom(INTRA_SENTENTIAL_ATTRIBUTION_MAPPINGS, signalPhraseWords); + if (relation.isPresent()) { + leftConstituentWords = rephraseIntraSententialAttribution(leftConstituentWords); + leftConstituentType = Leaf.Type.SENT_SIM_CONTEXT; + + // swap superordinate with subordinate assignment + superordinationIsLeft = false; + } + + // enablement + if (!relation.isPresent()) { + if (isInfinitival(matcher.getNode("s"))) { + relation = Optional.of(Relation.ENABLEMENT); + rightConstituentWords = rephraseEnablement(matcher.getNode("s"), matcher.getNode("vp")); + rightConstituentType = Leaf.Type.SENT_SIM_CONTEXT; + } + } + + // general + if (!relation.isPresent()) { + relation = SignalPhraseClassifier.classifyGeneral(signalPhraseWords); + } + + return Optional.of(new SubordinationExtraction( + getClass().getSimpleName(), + relation.orElse(Relation.UNKNOWN_SUBORDINATION), + signalPhraseWords, + leftConstituentWords, // the
superordinate constituent + rightConstituentWords, // the subordinate constituent + superordinationIsLeft, + leftConstituentType, + rightConstituentType)); + } + + return Optional.empty(); + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/LeftSubordinateEnablementExtractor.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/LeftSubordinateEnablementExtractor.java new file mode 100644 index 0000000..7d03be8 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/LeftSubordinateEnablementExtractor.java @@ -0,0 +1,33 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : LeftSubordinateEnablementExtractor + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.rules; + +/** + * Enablement rule for a clause that stands before the subject and verb phrase of a sentence: the pattern + * binds a child clause of the sentence as s and a VP that follows a later NP sibling as vp. The class only + * supplies the pattern; the extraction logic comes from SubordinationEnablementExtractor. + */ +public class LeftSubordinateEnablementExtractor extends SubordinationEnablementExtractor { + + public LeftSubordinateEnablementExtractor() { + super("ROOT <<: (S < (S=s $.. (NP $..
VP=vp)))"); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/ListNPExtractor.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/ListNPExtractor.java new file mode 100644 index 0000000..ea90d34 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/ListNPExtractor.java @@ -0,0 +1,90 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : ListNPExtractor + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/.
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.rules; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.trees.Tree; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.extraction.ExtractionRule; +import org.lambda3.text.simplification.discourse.tree.extraction.model.CoordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.extraction.utils.ListNPSplitter; +import org.lambda3.text.simplification.discourse.tree.extraction.utils.TregexUtils; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Optional; + +/** + * Rule that splits a noun phrase holding a list into one constituent per list element. The tregex + * pattern is supplied by the caller and has to bind the noun phrase under the name np, because that is + * the node name this class reads from the match. + */ +public class ListNPExtractor extends ExtractionRule { + + /** + * @param pattern tregex pattern handed to the ExtractionRule constructor; it has to bind a node named np + */ + public ListNPExtractor(String pattern) { + super(pattern); + } + + /** + * Takes the first match returned by TregexUtils.sortedFindAt and asks ListNPSplitter to split its np + * node. When the split succeeds, one word list is built per element: the words of the tree before np, + * followed by the element words, followed by the words of the tree after np. These lists are wrapped + * in an n-ary CoordinationExtraction with the relation reported by the splitter and leaf type TERMINAL. + * + * @param parseTree parse tree of the sentence to examine + * @return the extraction, or an empty Optional when nothing matches or the splitter returns no result + */ + @Override + public Optional extract(Tree parseTree) { + + List matches = TregexUtils.sortedFindAt(parseTree, pattern, Collections.singletonList("np")); + if (matches.size() > 0) { + TregexUtils.MyMatch match = matches.get(0); + + Optional r = ListNPSplitter.split(match.getNode("np")); + if (r.isPresent()) { + + // constituents + List precedingWords = ParseTreeExtractionUtils.getPrecedingWords(parseTree, match.getNode("np"), false); + List followingWords = ParseTreeExtractionUtils.getFollowingWords(parseTree, match.getNode("np"), false); + List> constituentsWords = new ArrayList<>(); + + + for (List element : r.get().getElementsWords()) { + List constituentWords = new ArrayList<>(); + + constituentWords.addAll(precedingWords); + constituentWords.addAll(element); + constituentWords.addAll(followingWords); + + constituentsWords.add(constituentWords); + } + + // result + Extraction res = new CoordinationExtraction( +
getClass().getSimpleName(), + r.get().getRelation(), + constituentsWords, + Leaf.Type.TERMINAL + ); + + return Optional.of(res); + } + } + + + return Optional.empty(); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/ReferenceExtractorForContainingWords.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/ReferenceExtractorForContainingWords.java new file mode 100644 index 0000000..35e1864 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/ReferenceExtractorForContainingWords.java @@ -0,0 +1,77 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : ReferenceExtractorForContainingWords + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/.
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.rules; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.classification.SignalPhraseClassifier; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.extraction.ExtractionRule; +import org.lambda3.text.simplification.discourse.tree.extraction.model.RefCoordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; + +import java.util.List; +import java.util.Optional; + +/** + * Rule that produces a RefCoordinationExtraction from a sentence opening with a signal phrase. The + * pattern binds two nodes: node, and leaf, a childless node below it. Going by the tregex operators, node + * is a leftmost descendant of the root and first child of an S, and leaf is reached from node through a + * unary chain - see the Tregex documentation to confirm this reading. The words of leaf are used as the + * signal phrase and the words after node as the right constituent. + */ +public class ReferenceExtractorForContainingWords extends ExtractionRule { + + public ReferenceExtractorForContainingWords() { + super("ROOT <<: S <<, (__=node >1 S <<: (__=leaf !< __))"); + } + + + /** + * Matches the pattern at the given tree, classifies the words contained in leaf with + * SignalPhraseClassifier.classifyGeneral and, when a relation is found, returns a + * RefCoordinationExtraction whose right coordination consists of the words following node. + * + * @param parseTree parse tree of the sentence to examine + * @return the extraction, or an empty Optional when the pattern does not match or the signal phrase + * cannot be classified + */ + @Override + public Optional extract(Tree parseTree) { + + TregexMatcher matcher = pattern.matcher(parseTree); + + if (matcher.findAt(parseTree)) { + List signalPhraseWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("leaf")); + + // the right constituent + List rightConstituentWords = ParseTreeExtractionUtils.getFollowingWords(parseTree, matcher.getNode("node"), false); + + // result + Optional relation = SignalPhraseClassifier.classifyGeneral(signalPhraseWords); + if (relation.isPresent()) { + Extraction res = new RefCoordinationExtraction( + getClass().getSimpleName(), + relation.get(), + signalPhraseWords, + rightConstituentWords, + Leaf.Type.DEFAULT + ); + + return Optional.of(res); + } + } + + return Optional.empty(); + } +} diff --git
a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/ReferenceExtractorForPrecedingWords.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/ReferenceExtractorForPrecedingWords.java new file mode 100644 index 0000000..76817a8 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/ReferenceExtractorForPrecedingWords.java @@ -0,0 +1,76 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : ReferenceExtractorForPrecedingWords + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.rules; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.classification.SignalPhraseClassifier; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.extraction.ExtractionRule; +import org.lambda3.text.simplification.discourse.tree.extraction.model.RefCoordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; + +import java.util.List; +import java.util.Optional; + +/** + * Rule that produces a RefCoordinationExtraction from a sentence whose opening constituent contains a + * node labelled this or that. The pattern binds that opening constituent as node and the this/that node + * as det. The words of node before det are used as the signal phrase and the words after node as the + * right constituent. + * NOTE(review): the label test is the unanchored, case-sensitive regular expression this|that - verify + * that partial label matches and capitalised forms behave as intended. + */ +public class ReferenceExtractorForPrecedingWords extends ExtractionRule { + + public ReferenceExtractorForPrecedingWords() { + super("ROOT <<: S <<, (__=node >1 S << /this|that/=det)"); + } + + /** + * Matches the pattern at the given tree, classifies the signal phrase with + * SignalPhraseClassifier.classifyGeneral and, when a relation is found, returns a + * RefCoordinationExtraction whose right coordination consists of the words following node. + * The signal phrase lookup passes true as last argument to getPrecedingWords, unlike the other calls + * in this class which pass false; NOTE(review): the meaning of that flag is defined in + * ParseTreeExtractionUtils - confirm there. + * + * @param parseTree parse tree of the sentence to examine + * @return the extraction, or an empty Optional when the pattern does not match or the signal phrase + * cannot be classified + */ + @Override + public Optional extract(Tree parseTree) { + + TregexMatcher matcher = pattern.matcher(parseTree); + + if (matcher.findAt(parseTree)) { + List signalPhraseWords = ParseTreeExtractionUtils.getPrecedingWords(matcher.getNode("node"), matcher.getNode("det"), true); + + // the right constituent + List rightConstituentWords = ParseTreeExtractionUtils.getFollowingWords(parseTree, matcher.getNode("node"), false); + + // result + Optional relation = SignalPhraseClassifier.classifyGeneral(signalPhraseWords); + if (relation.isPresent()) { + Extraction res = new RefCoordinationExtraction( + getClass().getSimpleName(), + relation.get(), + signalPhraseWords, + rightConstituentWords, + Leaf.Type.DEFAULT + ); + + return Optional.of(res); + } + } + + return Optional.empty(); + } +} diff --git
a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/RightSubordinateEnablementExtractor.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/RightSubordinateEnablementExtractor.java new file mode 100644 index 0000000..092a50f --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/RightSubordinateEnablementExtractor.java @@ -0,0 +1,33 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : RightSubordinateEnablementExtractor + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.rules; + +/** + * Enablement rule for a clause inside the verb phrase of a sentence: the pattern binds the verb phrase + * that follows the subject NP as vp and a clause that follows an NP or PP below that verb phrase as s. + * The class only supplies the pattern; the extraction logic comes from SubordinationEnablementExtractor. + */ +public class RightSubordinateEnablementExtractor extends SubordinationEnablementExtractor { + + public RightSubordinateEnablementExtractor() { + super("ROOT <<: (S < (NP $.. (VP=vp <+(VP) (NP|PP $..
(S=s)))))"); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/SharedNPCoordinationExtractor.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/SharedNPCoordinationExtractor.java new file mode 100644 index 0000000..132cb19 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/SharedNPCoordinationExtractor.java @@ -0,0 +1,30 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : SharedNPCoordinationExtractor + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.rules; + +public class SharedNPCoordinationExtractor extends CoordinationExtractor { + + public SharedNPCoordinationExtractor() { + super("ROOT <<: (S < (NP $..
(VP=vp < (VP $..VP))))", "vp", "VP"); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/SubordinationEnablementExtractor.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/SubordinationEnablementExtractor.java new file mode 100644 index 0000000..f6c961d --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/SubordinationEnablementExtractor.java @@ -0,0 +1,101 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : SubordinationEnablementExtractor + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/.
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.rules; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.extraction.ExtractionRule; +import org.lambda3.text.simplification.discourse.tree.extraction.model.SubordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +/** + * + */ +abstract class SubordinationEnablementExtractor extends ExtractionRule { + + SubordinationEnablementExtractor(String pattern) { + super(pattern); + } + + private List getSuperordinateConstituentWords(Tree parseTree, Tree node) { + List constituentWords = new ArrayList<>(); + constituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(parseTree, node, false)); + constituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(parseTree, node, false)); + return constituentWords; + } + + private List getSubordinateConstituentWords(Tree node) { + return ParseTreeExtractionUtils.getContainingWords(node); + } + + @Override + public Optional extract(Tree parseTree) { + + TregexMatcher matcher = pattern.matcher(parseTree); + + if (matcher.findAt(parseTree)) { + + List superordinateConstituentWords; + List subordinateConstituentWords; + + superordinateConstituentWords = getSuperordinateConstituentWords(parseTree, matcher.getNode("s")); + subordinateConstituentWords = getSubordinateConstituentWords(matcher.getNode("s")); + + // result + Optional relation = Optional.empty(); + Leaf.Type leftConstituentType = 
Leaf.Type.DEFAULT; + Leaf.Type rightConstituentType = Leaf.Type.DEFAULT; + + // enablement + if (isInfinitival(matcher.getNode("s"))) { + relation = Optional.of(Relation.ENABLEMENT); + subordinateConstituentWords = rephraseEnablement(matcher.getNode("s"), matcher.getNode("vp")); + rightConstituentType = Leaf.Type.SENT_SIM_CONTEXT; + } + + if (relation.isPresent()) { + return Optional.of( + new SubordinationExtraction( + getClass().getSimpleName(), + relation.get(), + null, + superordinateConstituentWords, // the superordinate constituent + subordinateConstituentWords, // the subordinate constituent + true, + leftConstituentType, + rightConstituentType) + ); + } + } + + return Optional.empty(); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/SubordinationExtractor.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/SubordinationExtractor.java new file mode 100644 index 0000000..2585678 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/rules/SubordinationExtractor.java @@ -0,0 +1,97 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : SubordinationExtractor + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.rules; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.tree.classification.SignalPhraseClassifier; +import org.lambda3.text.simplification.discourse.tree.extraction.Extraction; +import org.lambda3.text.simplification.discourse.tree.extraction.ExtractionRule; +import org.lambda3.text.simplification.discourse.tree.extraction.model.SubordinationExtraction; +import org.lambda3.text.simplification.discourse.tree.model.Leaf; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public class SubordinationExtractor extends ExtractionRule { + + SubordinationExtractor(String pattern) { + super(pattern); + } + + public SubordinationExtractor() { + this("ROOT <<: (S < (SBAR=sbar < (S=s) $.. (NP $.. 
VP=vp)))"); + } + + @Override + public Optional extract(Tree parseTree) { + + TregexMatcher matcher = pattern.matcher(parseTree); + + if (matcher.findAt(parseTree)) { + List signalPhraseWords = ParseTreeExtractionUtils.getPrecedingWords(matcher.getNode("sbar"), matcher.getNode("s"), false); + + // the left, subordinate constituent + List leftConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s")); + + // the right, superordinate constituent + List rightConstituentWords = new ArrayList<>(); + rightConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(parseTree, matcher.getNode("sbar"), false)); + rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(parseTree, matcher.getNode("sbar"), false)); + + // result + Optional relation = Optional.empty(); + Leaf.Type leftConstituentType = Leaf.Type.DEFAULT; + Leaf.Type rightConstituentType = Leaf.Type.DEFAULT; + + // enablement + if (isInfinitival(matcher.getNode("s"))) { + relation = Optional.of(Relation.ENABLEMENT); + leftConstituentWords = rephraseEnablement(matcher.getNode("s"), matcher.getNode("vp")); + leftConstituentType = Leaf.Type.SENT_SIM_CONTEXT; + } + + // general + if (!relation.isPresent()) { + relation = SignalPhraseClassifier.classifyGeneral(signalPhraseWords); + } + + return Optional.of(new SubordinationExtraction( + getClass().getSimpleName(), + relation.orElse(Relation.UNKNOWN_SUBORDINATION), + signalPhraseWords, + leftConstituentWords, // the subordinate constituent + rightConstituentWords, // the superordinate constituent + false, + leftConstituentType, + rightConstituentType)); + } + + return Optional.empty(); + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/utils/ListNPSplitter.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/utils/ListNPSplitter.java new file mode 100644 index 0000000..aafb73e --- /dev/null +++ 
b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/utils/ListNPSplitter.java @@ -0,0 +1,99 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : ListNPSplitter + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.utils; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.trees.Tree; +import org.lambda3.text.simplification.discourse.tree.Relation; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +/** + * + */ +public class ListNPSplitter { + + public static Optional split(Tree np) { + + // representation + String representation = np.getChildrenAsList().stream().map(c -> (c.value().equals("CC")) ? 
c.yieldWords().get(0).value() : c.value()).collect(Collectors.joining("")); + + final String LIST_CONJUNCTION_PATTERN = "^(NP|,)*NP(NP|,)*(and(NP|,)*NP(NP|,)*)+$"; + final String LIST_DISJUNCTION_PATTERN = "^(NP|,)*NP(NP|,)*(or(NP|,)*NP(NP|,)*)+$"; + if (representation.matches(LIST_CONJUNCTION_PATTERN) || representation.matches(LIST_DISJUNCTION_PATTERN)) { + Relation relation = representation.matches(LIST_CONJUNCTION_PATTERN) ? Relation.JOINT_NP_LIST : Relation.JOINT_NP_DISJUNCTION; + + // get last CC index + int lastCCIdx = 0; + for (int i = np.getChildrenAsList().size() - 1; i >= 0; i--) { + Tree child = np.getChildrenAsList().get(i); + if (child.value().equals("CC")) { + lastCCIdx = i; + break; + } + } + + // extract + List> elementsWords = new ArrayList<>(); + boolean foundFirstNPAfterCC = false; + for (int i = 0; i < np.getChildrenAsList().size(); i++) { + Tree child = np.getChildrenAsList().get(i); + + if (foundFirstNPAfterCC) { + elementsWords.get(elementsWords.size() - 1).addAll(child.yieldWords()); + } else if (child.value().equals("NP")) { + elementsWords.add(child.yieldWords()); + if (i > lastCCIdx) { + foundFirstNPAfterCC = true; + } + } + } + + return Optional.of(new Result(elementsWords, relation)); + } + + return Optional.empty(); + } + + public static class Result { + private final List> elementsWords; + private final Relation relation; + + public Result(List> elementsWords, Relation relation) { + this.elementsWords = elementsWords; + this.relation = relation; + } + + public List> getElementsWords() { + return elementsWords; + } + + public Relation getRelation() { + return relation; + } + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/utils/TregexUtils.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/utils/TregexUtils.java new file mode 100644 index 0000000..7132b5b --- /dev/null +++ 
b/src/main/java/org/lambda3/text/simplification/discourse/tree/extraction/utils/TregexUtils.java @@ -0,0 +1,111 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : TregexUtils + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.extraction.utils; + +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import edu.stanford.nlp.trees.tregex.TregexPattern; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +/** + * + */ +public class TregexUtils { + + public static List sortedFindAt(Tree parseTree, TregexPattern p, List groupsToOrder) { + List res = new ArrayList<>(); + + TregexMatcher matcher = p.matcher(parseTree); + while (matcher.findAt(parseTree)) { + HashMap groups = new HashMap<>(); + for (String name : matcher.getNodeNames()) { + groups.put(name, matcher.getNode(name)); + } + res.add(new MyMatch(groups)); + } + + // sort groups + res.sort(new MyMatch.Comparator(parseTree, groupsToOrder)); + + return res; + } + + public static List sortedFind(Tree parseTree, TregexPattern p, List groupsToOrder) { + List res = new ArrayList<>(); + + TregexMatcher 
matcher = p.matcher(parseTree); + while (matcher.find()) { + HashMap groups = new HashMap<>(); + for (String name : matcher.getNodeNames()) { + groups.put(name, matcher.getNode(name)); + } + res.add(new MyMatch(groups)); + } + + // sort groups + res.sort(new MyMatch.Comparator(parseTree, groupsToOrder)); + + return res; + } + + public static class MyMatch { + private final HashMap groups; + + public MyMatch(HashMap groups) { + this.groups = groups; + } + + public Tree getNode(String name) { + if (groups.containsKey(name)) { + return groups.get(name); + } else { + throw new IllegalArgumentException("No tree for name: '" + name + "'"); + } + } + + public static class Comparator implements java.util.Comparator { + private final Tree anchorTree; + private final List names; + + public Comparator(Tree anchorTree, List names) { + this.anchorTree = anchorTree; + this.names = names; + } + + @Override + public int compare(MyMatch myMatch, MyMatch otherMatch) { + int myMatchValue = 0; + int otherMatchValue = 0; + for (String name : names) { + myMatchValue += myMatch.getNode(name).nodeNumber(anchorTree); + otherMatchValue += otherMatch.getNode(name).nodeNumber(anchorTree); + } + + return myMatchValue - otherMatchValue; + } + } + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Coordination.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Coordination.java new file mode 100644 index 0000000..e91e176 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Coordination.java @@ -0,0 +1,131 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : Coordination + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or 
+ * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.model; + +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +/** + * + */ +public class Coordination extends DiscourseTree { + final Relation relation; + private final String signalPhrase; // optional + private final List coordinations; + + public Coordination(String extractionRule, Relation relation, String signalPhrase, List coordinations) { + super(extractionRule); + this.relation = relation; + this.signalPhrase = signalPhrase; + this.coordinations = new ArrayList<>(); + coordinations.forEach(this::addCoordination); + } + + public void addCoordination(DiscourseTree coordination) { + this.coordinations.add(coordination); + coordination.parent = this; + } + + public void invalidateCoordination(DiscourseTree coordination) { + replaceCoordination(coordination, new Invalidation()); + } + + public void replaceCoordination(DiscourseTree oldCoordination, DiscourseTree newCoordination) { + for (int i = 0; i < coordinations.size(); i++) { + if (coordinations.get(i).equals(oldCoordination)) { + coordinations.set(i, newCoordination); + newCoordination.parent = this; + newCoordination.setRecursiveUnsetSentenceIdx(oldCoordination.getSentenceIdx()); + break; + } + } + } + + public void 
removeInvalidations() { + for (int i = coordinations.size() - 1; i >= 0; i--) { + if (coordinations.get(i) instanceof Invalidation) { + coordinations.remove(i); + } + } + } + + public Relation getRelation() { + return relation; + } + + public List getCoordinations() { + return coordinations; + } + + public List getOtherCoordinations(DiscourseTree coordination) { + return coordinations.stream().filter(c -> !c.equals(coordination)).collect(Collectors.toList()); + } + + public List getOtherPrecedingCoordinations(DiscourseTree coordination) { + List res = new ArrayList<>(); + + for (DiscourseTree child : coordinations) { + if (child.equals(coordination)) { + break; + } else { + res.add(child); + } + } + + return res; + } + + public List getOtherFollowingCoordinations(DiscourseTree coordination) { + List res = new ArrayList<>(); + + boolean found = false; + for (DiscourseTree child : coordinations) { + if (child.equals(coordination)) { + found = true; + } else { + if (found) { + res.add(child); + } + } + } + + return res; + } + + @Override + public List getPTPCaption() { + String signalPhraseStr = (signalPhrase != null) ? 
"'" + signalPhrase + "'" : "NULL"; + return Collections.singletonList("CO/" + relation + " (" + signalPhraseStr + ", " + extractionRule + ")"); + } + + @Override + public List getPTPEdges() { + return coordinations.stream().map(c -> new PrettyTreePrinter.DefaultEdge("n", c, true)).collect(Collectors.toList()); + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/model/DiscourseTree.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/DiscourseTree.java new file mode 100644 index 0000000..f0f3d9c --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/DiscourseTree.java @@ -0,0 +1,158 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : DiscourseTree + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.model; + +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public abstract class DiscourseTree implements PrettyTreePrinter.Node { + final String extractionRule; + DiscourseTree parent; //optional + private boolean processed; + private int sentenceIdx; + + DiscourseTree(String extractionRule) { + this.extractionRule = extractionRule; + this.processed = false; + this.parent = null; // should be set by inherited classes + this.sentenceIdx = -1; // should be set by inherited classes + } + + void setRecursiveUnsetSentenceIdx(int sentenceIdx) { + if (this.sentenceIdx < 0) { + this.sentenceIdx = sentenceIdx; + + // recursive + if (this instanceof Coordination) { + ((Coordination) this).getCoordinations().forEach(c -> c.setRecursiveUnsetSentenceIdx(sentenceIdx)); + } + if (this instanceof Subordination) { + ((Subordination) this).getLeftConstituent().setRecursiveUnsetSentenceIdx(sentenceIdx); + ((Subordination) this).getRightConstituent().setRecursiveUnsetSentenceIdx(sentenceIdx); + } + } + } + + public void cleanup() { + if (this instanceof Coordination) { + + // remove invalidations + ((Coordination) this).removeInvalidations(); + + // recursion + ((Coordination) this).getCoordinations().forEach(DiscourseTree::cleanup); + } + + if (this instanceof Subordination) { + + // recursion + ((Subordination) this).getLeftConstituent().cleanup(); + ((Subordination) this).getRightConstituent().cleanup(); + } + } + + public boolean usableAsReference() { + return ((parent != null) && (parent instanceof Coordination) && (((Coordination) parent).relation.equals(Relation.UNKNOWN_COORDINATION))); + } + + public void useAsReference() { + if (usableAsReference()) { + ((Coordination) 
parent).invalidateCoordination(this); + } else { + throw new AssertionError("Not useable as reference"); + } + } + + public List getNucleusPathLeaves() { + List res = new ArrayList<>(); + + if (this instanceof Leaf) { + res.add((Leaf) this); + } else { + // recursion on coordinations + if (this instanceof Coordination) { + for (DiscourseTree child : ((Coordination) this).getCoordinations()) { + res.addAll(child.getNucleusPathLeaves()); + } + } + + // recursion on superordinations + if (this instanceof Subordination) { + res.addAll(((Subordination) this).getSuperordination().getNucleusPathLeaves()); + } + } + + return res; + } + + public Optional getPreviousNode() { + if (parent != null) { + if (parent instanceof Coordination) { + Coordination p = (Coordination) parent; + DiscourseTree prev = null; + for (DiscourseTree child : p.getCoordinations()) { + if ((child.equals(this)) && (prev != null)) { + return Optional.of(prev); + } + prev = child; + } + } + if (parent instanceof Subordination) { + Subordination p = (Subordination) parent; + if (p.getRightConstituent().equals(this)) { + return Optional.of(p.getLeftConstituent()); + } + } + + // recursion + return parent.getPreviousNode(); + } + + return Optional.empty(); + } + + public void setProcessed() { + this.processed = true; + } + + public boolean isNotProcessed() { + return !processed; + } + + public String getExtractionRule() { + return extractionRule; + } + + public int getSentenceIdx() { + return sentenceIdx; + } + + @Override + public String toString() { + return PrettyTreePrinter.prettyPrint(this, false); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Invalidation.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Invalidation.java new file mode 100644 index 0000000..07d30bb --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Invalidation.java @@ -0,0 +1,49 @@ +/* + * 
==========================License-Start============================= + * DiscourseSimplification : Invalidation + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.model; + +import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * + */ +public class Invalidation extends DiscourseTree { + + public Invalidation() { + super(""); + } + + @Override + public List getPTPCaption() { + return Collections.singletonList("INVALIDATED"); + } + + @Override + public List getPTPEdges() { + return new ArrayList<>(); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Leaf.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Leaf.java new file mode 100644 index 0000000..2049e85 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Leaf.java @@ -0,0 +1,75 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : Leaf + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute 
it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.model; + +import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * + */ +public class Leaf extends DiscourseTree { + private final Type type; + private final String text; + + public Leaf(Type type, String extractionRule, String text) { + super(extractionRule); + this.type = type; + this.text = text; + } + + public Type getType() { + return type; + } + + public String getText() { + return text; + } + + @Override + public List getPTPCaption() { + String typeStr = ""; + if (type.equals(Type.TERMINAL)) { + typeStr = " [terminal]"; + } else if (type.equals(Type.SENT_SIM_CONTEXT)) { + typeStr = " [s-context]"; + } + + return Collections.singletonList("'" + text + "'" + typeStr); + } + + @Override + public List getPTPEdges() { + return new ArrayList<>(); + } + + public enum Type { + DEFAULT, // can be splitted + TERMINAL, // will not be splitted + SENT_SIM_CONTEXT // will not be splitted and will act like a sentence-simplification content in Step 3 + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/model/SentenceLeaf.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/SentenceLeaf.java 
new file mode 100644 index 0000000..777e6bf --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/SentenceLeaf.java @@ -0,0 +1,34 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : SentenceLeaf + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.model; + +/** + * + */ +public class SentenceLeaf extends Leaf { + + public SentenceLeaf(String sentence, int sentenceIdx) { + super(Type.DEFAULT, "SENTENCE", sentence); + this.setRecursiveUnsetSentenceIdx(sentenceIdx); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Subordination.java b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Subordination.java new file mode 100644 index 0000000..e7dc77c --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/tree/model/Subordination.java @@ -0,0 +1,119 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : Subordination + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.tree.model; + +import org.lambda3.text.simplification.discourse.tree.Relation; +import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * + */ +public class Subordination extends DiscourseTree { + private final Relation relation; + private final String signalPhrase; // optional + private final boolean superordinationIsLeft; + private DiscourseTree leftConstituent; + private DiscourseTree rightConstituent; + + public Subordination(String extractionRule, Relation relation, String signalPhrase, DiscourseTree leftConstituent, DiscourseTree rightConstituent, boolean superordinationIsLeft) { + super(extractionRule); + this.relation = relation; + this.signalPhrase = signalPhrase; + this.superordinationIsLeft = superordinationIsLeft; + + this.leftConstituent = new Leaf(Leaf.Type.DEFAULT, "tmp", "tmp"); + this.rightConstituent = new Leaf(Leaf.Type.DEFAULT, "tmp", "tmp"); + replaceLeftConstituent(leftConstituent); + replaceRightConstituent(rightConstituent); + } + + public void replaceLeftConstituent(DiscourseTree newLeftConstituent) { + DiscourseTree oldLeftConstituent = this.leftConstituent; + this.leftConstituent = newLeftConstituent; + newLeftConstituent.parent = this; + newLeftConstituent.setRecursiveUnsetSentenceIdx(oldLeftConstituent.getSentenceIdx()); + } + + private void replaceRightConstituent(DiscourseTree newRightConstituent) { + DiscourseTree oldRightConstituent = this.rightConstituent; + this.rightConstituent = newRightConstituent; + newRightConstituent.parent = this; + newRightConstituent.setRecursiveUnsetSentenceIdx(oldRightConstituent.getSentenceIdx()); + } + + public void replaceSuperordination(DiscourseTree newSuperordination) { + if (superordinationIsLeft) { + replaceLeftConstituent(newSuperordination); + } else { + 
replaceRightConstituent(newSuperordination); + } + } + + public void replaceSubordination(DiscourseTree newSubordination) { + if (superordinationIsLeft) { + replaceRightConstituent(newSubordination); + } else { + replaceLeftConstituent(newSubordination); + } + } + + public Relation getRelation() { + return relation; + } + + public DiscourseTree getLeftConstituent() { + return leftConstituent; + } + + public DiscourseTree getRightConstituent() { + return rightConstituent; + } + + public DiscourseTree getSuperordination() { + return (superordinationIsLeft) ? leftConstituent : rightConstituent; + } + + public DiscourseTree getSubordination() { + return (superordinationIsLeft) ? rightConstituent : leftConstituent; + } + + @Override + public List getPTPCaption() { + String signalPhraseStr = (signalPhrase != null) ? "'" + signalPhrase + "'" : "NULL"; + return Collections.singletonList("SUB/" + relation + " (" + signalPhraseStr + ", " + extractionRule + ")"); + } + + @Override + public List getPTPEdges() { + List res = new ArrayList<>(); + res.add(new PrettyTreePrinter.DefaultEdge((superordinationIsLeft) ? "n" : "s", leftConstituent, true)); + res.add(new PrettyTreePrinter.DefaultEdge((superordinationIsLeft) ? 
"s" : "n", rightConstituent, true)); + + return res; + } + +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/IndexRange.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/IndexRange.java new file mode 100644 index 0000000..b13eafe --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/IndexRange.java @@ -0,0 +1,49 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : IndexRange + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
/**
 * Immutable pair of a start index and an end index.
 *
 * <p>The class does not validate or order the two values; it stores them as given.
 */
public class IndexRange {
    private final int fromIdx;
    private final int toIdx;

    /**
     * @param fromIdx start index of the range
     * @param toIdx   end index of the range
     */
    public IndexRange(int fromIdx, int toIdx) {
        this.fromIdx = fromIdx;
        this.toIdx = toIdx;
    }

    /** @return the start index passed to the constructor */
    public int getFromIdx() {
        return this.fromIdx;
    }

    /** @return the end index passed to the constructor */
    public int getToIdx() {
        return this.toIdx;
    }

    /** @return the range rendered as {@code (from | to)} */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("(").append(fromIdx);
        text.append(" | ");
        text.append(toIdx).append(")");
        return text.toString();
    }
}
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.util.*; +import java.util.stream.Collectors; + +public class PrettyTreePrinter { + + private static final String DOT_INDENT = " "; + + private static final NodeShape DEFAULT_NODE_SHAPE = NodeShape.box; + private static final List DEFAULT_NODE_STYLES = Collections.singletonList(NodeStyle.solid); + private static final String DEFAULT_NODE_COLOR = "black"; + private static final String DEFAULT_NODE_FILLCOLOR = "white"; + + private static final EdgeShape DEFAULT_EDGE_SHAPE = EdgeShape.box; + private static final List DEFAULT_EDGE_STYLES = Collections.singletonList(EdgeStyle.solid); + private static final String DEFAULT_EDGE_COLOR = "black"; + + // INTERFACES & CLASSES //////////////////////////////////////////////////////////////////////////////////////////// + + private static String trimText(String text, Integer maxTextLen) { + final String SUFFIX = "..."; + + if ((maxTextLen != null) && (text.length() > maxTextLen)) { + if (maxTextLen < SUFFIX.length()) { + throw new IllegalArgumentException("maxTextLen should have at least the length: " + SUFFIX.length()); + } + + return text.substring(0, maxTextLen - SUFFIX.length()) + SUFFIX; + } else { + return text; + } + } + + private static int getBottomDepth(Node node, boolean follow) { + if ((!follow) || (node.getPTPEdges().size() <= 0)) { + return 0; + } else { + OptionalInt max = node.getPTPEdges().stream().mapToInt(e -> getBottomDepth(e.getPTPChild(), e.followPTPChild())).max(); + return max.orElse(-1) + 1; + } + } + + private static String getEdgeIndent(int size, String edgeCaption, boolean lastChild) { + String front = (lastChild) ? 
"└─" : "├─"; + String back = "─> "; + + String middle = trimText(edgeCaption, size - front.length() - back.length()); + + boolean right = true; + while (front.length() + middle.length() + back.length() < size) { + middle = (right) ? middle + "─" : "─" + middle; + right = !right; + } + + return front + middle + back; + } + + private static String getIndent(int size, boolean lastChild) { + StringBuilder res = new StringBuilder((lastChild) ? " " : "|"); + while (res.length() < size) { + res.append(" "); + } + + return res.toString(); + } + + private static List prettyPrintRec(Node node, boolean follow, boolean reversed, int size) { + List res = new ArrayList<>(); + + int bottomDepth = getBottomDepth(node, follow); + + // this node + res.addAll(node.getPTPCaption()); + + // edges + if (follow) { + + ListIterator iter = (reversed) ? node.getPTPEdges().listIterator(node.getPTPEdges().size()) : node.getPTPEdges().listIterator(); + while ((reversed) ? iter.hasPrevious() : iter.hasNext()) { + Edge edge = (reversed) ? iter.previous() : iter.next(); + boolean endChild = ((reversed) ? 
!iter.hasPrevious() : !iter.hasNext()); + int indentSize = (bottomDepth - getBottomDepth(edge.getPTPChild(), edge.followPTPChild())) * size; + + boolean firstChildLine = true; + for (String childLine : prettyPrintRec(edge.getPTPChild(), edge.followPTPChild(), reversed, size)) { + if (firstChildLine) { + res.add(getEdgeIndent(indentSize, edge.getPTPCaption(), endChild) + childLine); + firstChildLine = false; + } else { + res.add(getIndent(indentSize, endChild) + childLine); + } + } + } + } + + return res; + } + + // GRAPHICAL ENUMS ///////////////////////////////////////////////////////////////////////////////////////////////// + + public static String prettyPrint(Node node, boolean reversed, int size) { + return prettyPrintRec(node, true, reversed, size).stream().collect(Collectors.joining("\n")); + } + + public static String prettyPrint(Node node, boolean reversed) { + return prettyPrint(node, reversed, 10); + } + + public static String prettyPrint(Node node, int size) { + return prettyPrint(node, false, size); + } + + public static String prettyPrint(Node node) { + return prettyPrintRec(node, true, false, 10).stream().collect(Collectors.joining("\n")); + } + + private static long addDotLineRec(Node node, boolean follow, StringBuilder strb, HashMap idMap) { + + long id; + if (idMap.containsKey(node)) { + id = idMap.get(node); + } else { + id = idMap.size(); + idMap.put(node, id); + } + + NodeShape nodeShape = (node instanceof GNode) ? ((GNode) node).getPTPNodeShape() : DEFAULT_NODE_SHAPE; + List nodeStyles = (node instanceof GNode) ? ((GNode) node).getPTPNodeStyles() : DEFAULT_NODE_STYLES; + String nodeColor = (node instanceof GNode) ? ((GNode) node).getPTPFillColor() : DEFAULT_NODE_COLOR; + String nodeFillColor = (node instanceof GNode) ? 
((GNode) node).getPTPFillColor() : DEFAULT_NODE_FILLCOLOR; + String nodeLabel = node.getPTPCaption().stream().collect(Collectors.joining("\n")); + nodeLabel = nodeLabel.replaceAll("\\n", "\\\\n"); + + // this node + String nodeLine = String.format("\"%d\" [shape=\"%s\", style=\"%s\", color=\"%s\", fillcolor=\"%s\", label=\"%s\"];", + id, + nodeShape, + nodeStyles.stream().map(Enum::name).collect(Collectors.joining(",")), + nodeColor, + nodeFillColor, + nodeLabel + ); + strb.append(DOT_INDENT).append(nodeLine).append("\n"); + + // edges + if (follow) { + + for (Edge edge : node.getPTPEdges()) { + + // child (recursion) + long childId = addDotLineRec(edge.getPTPChild(), edge.followPTPChild(), strb, idMap); + + EdgeShape edgeShape = (edge instanceof GEdge) ? ((GEdge) edge).getPTPEdgeShape() : DEFAULT_EDGE_SHAPE; + List edgeStyles = (edge instanceof GEdge) ? ((GEdge) edge).getPTPEdgeStyles() : DEFAULT_EDGE_STYLES; + String edgeColor = (edge instanceof GEdge) ? ((GEdge) edge).getPTPColor() : DEFAULT_EDGE_COLOR; + String edgeLabel = edge.getPTPCaption(); + edgeLabel = edgeLabel.replaceAll("\\n", "\\\\n"); + + String edgeLine = String.format("\"%s\" -> \"%s\" [shape=\"%s\", style=\"%s\", color=\"%s\", label=\"%s\"];", + id, + childId, + edgeShape, + edgeStyles.stream().map(Enum::name).collect(Collectors.joining(",")), + edgeColor, + edgeLabel + ); + strb.append(DOT_INDENT).append(edgeLine).append("\n"); + } + } + + return id; + } + + // GENERAL FUNCTIONS /////////////////////////////////////////////////////////////////////////////////////////////// + + public static String visualize(Node node, String graphName, String title) { + StringBuilder strb = new StringBuilder(); + strb.append(String.format("digraph %s {", graphName)).append("\n"); + + if (title != null) { + strb.append(DOT_INDENT + "labelloc=\"t\";" + "\n"); + strb.append(DOT_INDENT).append(String.format("label=\"%s\";", title)).append("\n"); + } + + HashMap idMap = new HashMap<>(); + addDotLineRec(node, true, 
strb, idMap); + + List leafIDs = idMap.keySet().stream().filter(n -> n.getPTPEdges().size() <= 0).map(idMap::get).collect(Collectors.toList()); + String sameRankLine = String.format("{rank = same; %s};", leafIDs.stream().map(i -> "\"" + i + "\"").collect(Collectors.joining("; "))); + strb.append(DOT_INDENT).append(sameRankLine).append("\n"); + + strb.append("}"); + return strb.toString(); + } + + // TEXTUAL REPRESENTATION ////////////////////////////////////////////////////////////////////////////////////////// + + public static void visualizeToFile(Node node, String graphName, String title, String filepath) throws IOException { + String str = visualize(node, graphName, title); + + BufferedWriter writer; + writer = new BufferedWriter(new FileWriter(filepath)); + writer.write(str); + writer.close(); + } + + public enum NodeShape { + box, + polygon, + ellipse, + circle, + point, + egg, + triangle, + plaintext, + diamond, + trapezium, + parallelogram, + house, + pentagon, + hexagon, + septagon, + octagon, + doublecircle, + doubleoctagon, + tripleoctagon, + invtriangle, + invtrapezium, + invhouse, + Mdiamond, + Msquare, + Mcircle, + rect, + rectangle, + square, + none, + note, + tab, + folder, + box3d, + component + } + + public enum NodeStyle { + dashed, + dotted, + solid, + invis, + bold, + filled, + diagonals, + rounded + } + + public enum EdgeShape { + box, + crow, + diamond, + dot, + inv, + none, + normal, + tee, + vee + } + + public enum EdgeStyle { + dashed, + dotted, + solid, + invis, + bold + } + + public enum EdgeDir { + forward, + back, + both, + none + } + + public interface Node { + List getPTPCaption(); + + List getPTPEdges(); + } + + public interface Edge { + String getPTPCaption(); + + Node getPTPChild(); + + boolean followPTPChild(); + } + + // GRAPHICAL REPRESENTATION //////////////////////////////////////////////////////////////////////////////////////// + + public interface GNode extends Node { + NodeShape getPTPNodeShape(); + + List 
getPTPNodeStyles(); + + String getPTPColor(); + + String getPTPFillColor(); + } + + public interface GEdge extends Edge { + EdgeShape getPTPEdgeShape(); + + List getPTPEdgeStyles(); + + String getPTPColor(); + } + + public static class DefaultEdge implements Edge { + private final String caption; + private final Node child; + private final boolean followPTPChild; + + public DefaultEdge(String caption, Node child, boolean followPTPChild) { + this.caption = caption; + this.child = child; + this.followPTPChild = followPTPChild; + } + + @Override + public String getPTPCaption() { + return caption; + } + + @Override + public Node getPTPChild() { + return child; + } + + @Override + public boolean followPTPChild() { + return followPTPChild; + } + + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERExtractionUtils.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERExtractionUtils.java new file mode 100644 index 0000000..b14c417 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERExtractionUtils.java @@ -0,0 +1,44 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : NERExtractionUtils + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.ner; + +import org.lambda3.text.simplification.discourse.utils.IndexRange; + +import java.util.ArrayList; +import java.util.List; + +/** + * + */ +public class NERExtractionUtils { + + public static List getNERIndexRanges(NERString nerString) { + List res = new ArrayList<>(); + + for (NERTokenGroup group : nerString.getGroups()) { + res.add(new IndexRange(group.getFromTokenIndex(), group.getToTokenIndex())); + } + + return res; + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERString.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERString.java new file mode 100644 index 0000000..c4d0f51 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERString.java @@ -0,0 +1,86 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : NERString + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.ner; + +import edu.stanford.nlp.ling.Word; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +/** + * + */ +public class NERString { + public static final String NO_CATEGORY = "O"; + + protected final List tokens; + private List groups; + + public NERString(List tokens) { + this.tokens = tokens; + this.createGroups(); + } + + private void createGroups() { + this.groups = new ArrayList<>(); + + String lastCategory = null; + List currGroupTokens = new ArrayList<>(); + for (NERToken nerToken : this.tokens) { + + if ((lastCategory != null) && (!nerToken.getCategory().equals(lastCategory))) { + // add + this.groups.add(new NERTokenGroup(currGroupTokens)); + currGroupTokens = new ArrayList<>(); + } + + currGroupTokens.add(nerToken); + lastCategory = nerToken.getCategory(); + } + + // add + this.groups.add(new NERTokenGroup(currGroupTokens)); + } + + public List getTokens() { + return tokens; + } + + public List getGroups() { + return groups; + } + + private List getWords(int fromIndex, int toIndex) { + return tokens.subList(fromIndex, toIndex).stream().map(t -> new Word(t.getText())).collect(Collectors.toList()); + } + + public List getWords() { + return getWords(0, tokens.size()); + } + + @Override + public String toString() { + return tokens.stream().map(NERToken::toString).collect(Collectors.joining("\n")); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERStringParseException.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERStringParseException.java new file mode 100644 index 0000000..a9783ef --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERStringParseException.java @@ -0,0 +1,33 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification 
/**
 * Checked exception carrying a plain message; thrown by {@code NERStringParser}
 * when NER output cannot be mapped onto a parse tree.
 */
public class NERStringParseException extends Exception {

    /**
     * @param message description of the failure, returned by {@link #getMessage()}
     */
    public NERStringParseException(final String message) {
        super(message);
    }
}
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.ner; + +import edu.stanford.nlp.ie.AbstractSequenceClassifier; +import edu.stanford.nlp.ie.crf.CRFClassifier; +import edu.stanford.nlp.trees.Tree; +import org.lambda3.text.simplification.discourse.utils.ner.tner.TNERString; +import org.lambda3.text.simplification.discourse.utils.ner.tner.TNERToken; +import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; +import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * + */ +public class NERStringParser { + + private static final AbstractSequenceClassifier NER_CLASSIFIER = CRFClassifier.getClassifierNoExceptions("edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"); + + public static NERString parse(String text) { + List tokens = new ArrayList<>(); + + String nerString = NER_CLASSIFIER.classifyToString(text); + String[] nerTokens = nerString.split(" "); + + int idx = 0; + for (String nerToken : nerTokens) { + int sep_idx = nerToken.lastIndexOf("/"); + + // create text + String txt = nerToken.substring(0, sep_idx); + String category = nerToken.substring(sep_idx + 1); + NERToken token = new NERToken(idx, txt, category); + tokens.add(token); + + ++idx; + } + + return new NERString(tokens); + } + + public static TNERString parse(Tree parseTree) throws NERStringParseException { + List tokens = new ArrayList<>(); + + List parseTreeLeafNumbers = 
ParseTreeExtractionUtils.getLeafNumbers(parseTree, parseTree); + String nerString = NER_CLASSIFIER.classifyToString(WordsUtils.wordsToString(parseTree.yieldWords())); + String[] nerTokens = nerString.split(" "); + + if (parseTreeLeafNumbers.size() != nerTokens.length) { + throw new NERStringParseException("Could not map NER string to parseTree"); + } + + int idx = 0; + for (String nerToken : nerTokens) { + int sep_idx = nerToken.lastIndexOf("/"); + + // create token + String text = nerToken.substring(0, sep_idx); + String category = nerToken.substring(sep_idx + 1); + TNERToken token = new TNERToken(idx, text, category, parseTree.getNodeNumber(parseTreeLeafNumbers.get(idx))); + tokens.add(token); + + ++idx; + } + + return new TNERString(tokens, parseTree); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERToken.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERToken.java new file mode 100644 index 0000000..2d02714 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERToken.java @@ -0,0 +1,61 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : NERToken + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.ner; + +import edu.stanford.nlp.ling.Word; + +/** + * + */ +public class NERToken { + protected final int index; + protected final String text; + protected final String category; + + public NERToken(int index, String text, String category) { + this.index = index; + this.text = text; + this.category = category; + } + + public int getIndex() { + return index; + } + + public String getText() { + return text; + } + + public Word getWord() { + return new Word(text); + } + + public String getCategory() { + return category; + } + + @Override + public String toString() { + return "(" + index + ": " + category + ", '" + text + "')"; + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERTokenGroup.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERTokenGroup.java new file mode 100644 index 0000000..6e41098 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERTokenGroup.java @@ -0,0 +1,68 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : NERTokenGroup + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.ner; + +import edu.stanford.nlp.ling.Word; + +import java.util.List; +import java.util.stream.Collectors; + +/** + * + */ +class NERTokenGroup { + private final List tokens; + + public NERTokenGroup(List tokens) { + this.tokens = tokens; + } + + public int getFromTokenIndex() { + return tokens.get(0).index; + } + + public int getToTokenIndex() { + return tokens.get(tokens.size() - 1).index; + } + + public List getTokens() { + return tokens; + } + + private String getCategory() { + return tokens.get(0).getCategory(); + } + + public boolean isNamedEntity() { + return !getCategory().equals(NERString.NO_CATEGORY); + } + + public List getWords() { + return tokens.stream().map(t -> new Word(t.getText())).collect(Collectors.toList()); + } + + @Override + public String toString() { + return "[\n" + tokens.stream().map(t -> "\t" + t.toString()).collect(Collectors.joining("\n")) + "\n]"; + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/tner/TNERString.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/tner/TNERString.java new file mode 100644 index 0000000..56df00f --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/tner/TNERString.java @@ -0,0 +1,46 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : TNERString + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.ner.tner; + +import edu.stanford.nlp.trees.Tree; +import org.lambda3.text.simplification.discourse.utils.ner.NERString; + +import java.util.ArrayList; +import java.util.List; + +/** + * + */ +public class TNERString extends NERString { + private final Tree parseTree; + + public TNERString(List tokens, Tree parseTree) { + super(new ArrayList<>(tokens)); + this.parseTree = parseTree; + this.tokens.forEach(t -> ((TNERToken) t).setNerString(this)); + } + + public Tree getParseTree() { + return parseTree; + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/tner/TNERToken.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/tner/TNERToken.java new file mode 100644 index 0000000..be56769 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/tner/TNERToken.java @@ -0,0 +1,69 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : TNERToken + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.ner.tner; + +import edu.stanford.nlp.trees.Tree; +import org.lambda3.text.simplification.discourse.utils.ner.NERToken; + +/** + * + */ +public class TNERToken extends NERToken { + + private final Tree leafNode; + private TNERString nerString; + private Tree posNode; + + public TNERToken(int index, String token, String category, Tree leafNode) { + super(index, token, category); + this.nerString = null; + this.leafNode = leafNode; + this.posNode = null; // wait until nerString is set + } + + public void setNerString(TNERString nerString) { + this.nerString = nerString; + this.posNode = leafNode.parent(getParseTree()); + } + + private Tree getParseTree() { + return nerString.getParseTree(); + } + + public Tree getLeafNode() { + return leafNode; + } + + public Tree getPosNode() { + return posNode; + } + + private String getPOSTag() { + return posNode.value(); + } + + @Override + public String toString() { + return "(" + index + ": " + category + ", '" + text + "', " + getPOSTag() + ")"; + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeException.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeException.java new file mode 100644 index 0000000..4c40e23 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeException.java @@ -0,0 +1,33 @@ +/* + * ==========================License-Start============================= 
/**
 * Checked exception reporting that a text could not be parsed; the message
 * quotes the offending text.
 */
public class ParseTreeException extends Exception {

    /**
     * @param text the text that could not be parsed; embedded in the message in double quotes
     */
    public ParseTreeException(final String text) {
        super(new StringBuilder("Failed to parse text: \"").append(text).append("\"").toString());
    }
}
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.parseTree; + +import edu.stanford.nlp.ling.Word; +import edu.stanford.nlp.trees.Tree; +import org.lambda3.text.simplification.discourse.utils.IndexRange; +import org.lambda3.text.simplification.discourse.utils.ner.NERExtractionUtils; +import org.lambda3.text.simplification.discourse.utils.ner.NERString; + +import java.util.ArrayList; +import java.util.List; + +public class ParseTreeExtractionUtils { + + public static List getLeafNumbers(Tree anchorTree, Tree node) { + List res = new ArrayList<>(); + for (Tree leaf : node.getLeaves()) { + res.add(leaf.nodeNumber(anchorTree)); + } + return res; + } + + private static IndexRange getLeafIndexRange(Tree anchorTree, Tree node) { + int fromIdx = -1; + int toIdx = -1; + + List leafNumbers = getLeafNumbers(anchorTree, anchorTree); + List nodeLeafNumbers = getLeafNumbers(anchorTree, node); + int fromNumber = nodeLeafNumbers.get(0); + int toNumber = nodeLeafNumbers.get(nodeLeafNumbers.size() - 1); + + int idx = 0; + for (int leafNumber : leafNumbers) { + if (leafNumber == fromNumber) { + fromIdx = idx; + } + if (leafNumber == toNumber) { + toIdx = idx; + } + ++idx; + } + + if ((fromIdx >= 0) && (toIdx >= 0)) { + return new IndexRange(fromIdx, toIdx); + } else { + throw new IllegalArgumentException("node should be a subtree of anchorTree."); + } + } + + // returns True, if the model of node would not split/divide a NER group, else False + public static boolean isNERSafeExtraction(Tree anchorTree, 
NERString anchorNERString, Tree node) { + IndexRange leafIdxRange = getLeafIndexRange(anchorTree, node); + List nerIdxRanges = NERExtractionUtils.getNERIndexRanges(anchorNERString); + + for (IndexRange nerIdxRange : nerIdxRanges) { + if (((nerIdxRange.getFromIdx() < leafIdxRange.getFromIdx()) && (leafIdxRange.getFromIdx() <= nerIdxRange.getToIdx())) + || ((nerIdxRange.getFromIdx() <= leafIdxRange.getToIdx()) && (leafIdxRange.getToIdx() < nerIdxRange.getToIdx()))) { + return false; + } + } + + return true; + } + + + private static Tree getFirstLeaf(Tree tree) { + if (tree.isLeaf()) { + return tree; + } else { + return getFirstLeaf(tree.firstChild()); + } + } + + private static Tree getLastLeaf(Tree tree) { + if (tree.isLeaf()) { + return tree; + } else { + return getLastLeaf(tree.lastChild()); + } + } + + public static List getWordsInBetween(Tree anchorTree, Tree leftNode, Tree rightNode, boolean includeLeft, boolean includeRight) { + List res = new ArrayList<>(); + + int startLeafNumber = (includeLeft) ? getFirstLeaf(leftNode).nodeNumber(anchorTree) : getLastLeaf(leftNode).nodeNumber(anchorTree) + 1; + int endLeafNumber = (includeRight) ? 
getLastLeaf(rightNode).nodeNumber(anchorTree) : getFirstLeaf(rightNode).nodeNumber(anchorTree) - 1; + if ((startLeafNumber < 0) || (endLeafNumber < 0)) { + return res; + } + + for (int i = startLeafNumber; i <= endLeafNumber; ++i) { + Tree node = anchorTree.getNodeNumber(i); + if (node.isLeaf()) { + res.addAll(node.yieldWords()); + } + } + + return res; + } + + public static List getPrecedingWords(Tree anchorTree, Tree node, boolean include) { + return getWordsInBetween(anchorTree, getFirstLeaf(anchorTree), node, true, include); + } + + public static List getFollowingWords(Tree anchorTree, Tree node, boolean include) { + return getWordsInBetween(anchorTree, node, getLastLeaf(anchorTree), include, true); + } + + public static List getContainingWords(Tree node) { + return node.yieldWords(); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeParser.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeParser.java new file mode 100644 index 0000000..92f0736 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeParser.java @@ -0,0 +1,56 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : ParseTreeParser + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.parseTree; + +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.parser.lexparser.LexicalizedParser; +import edu.stanford.nlp.process.CoreLabelTokenFactory; +import edu.stanford.nlp.process.PTBTokenizer; +import edu.stanford.nlp.process.TokenizerFactory; +import edu.stanford.nlp.trees.Tree; + +import java.io.StringReader; +import java.util.List; + +/** + * + */ +public class ParseTreeParser { + + private static final TokenizerFactory TOKENIZER_FACTORY = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); + private static final LexicalizedParser LEX_PARSER = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); + + static { + LEX_PARSER.setOptionFlags("-outputFormat", "penn,typedDependenciesCollapsed", "-retainTmpSubcategories"); + } + + public static Tree parse(String text) throws ParseTreeException { + List rawWords = TOKENIZER_FACTORY.getTokenizer(new StringReader(text)).tokenize(); + Tree bestParse = LEX_PARSER.parseTree(rawWords); + if (bestParse == null) { + throw new ParseTreeException(text); + } + + return bestParse; + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeVisualizer.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeVisualizer.java new file mode 100644 index 0000000..c095ec6 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeVisualizer.java @@ -0,0 +1,68 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : ParseTreeVisualizer + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published 
by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.parseTree; + +import edu.stanford.nlp.trees.Tree; +import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +/** + * + */ +public class ParseTreeVisualizer { + + public static String prettyPrint(Tree parseTree) { + MyNode node = new MyNode(parseTree, parseTree); + return PrettyTreePrinter.prettyPrint(node, false); + } + + private static class MyNode implements PrettyTreePrinter.Node { + private final List children; + private final String caption; + private final int nr; + + public MyNode(Tree parseNode, Tree anchor) { + this.caption = parseNode.value(); + this.children = new ArrayList<>(); + for (Tree childNode : parseNode.getChildrenAsList()) { + this.children.add(new MyNode(childNode, anchor)); + } + this.nr = parseNode.nodeNumber(anchor); + } + + @Override + public List getPTPCaption() { + return Arrays.asList(caption, "#" + nr); + } + + @Override + public List getPTPEdges() { + return children.stream().map(c -> new PrettyTreePrinter.DefaultEdge("", c, true)).collect(Collectors.toList()); + } + + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/sentences/SentencesUtils.java 
b/src/main/java/org/lambda3/text/simplification/discourse/utils/sentences/SentencesUtils.java new file mode 100644 index 0000000..d51e1b2 --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/sentences/SentencesUtils.java @@ -0,0 +1,56 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : SentencesUtils + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.sentences; + +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.ling.SentenceUtils; +import edu.stanford.nlp.process.DocumentPreprocessor; + +import java.io.*; +import java.util.ArrayList; +import java.util.List; + +/** + * + */ +public class SentencesUtils { + + private static List splitIntoSentences(Reader reader) { + List res = new ArrayList<>(); + + DocumentPreprocessor dp = new DocumentPreprocessor(reader); + for (List sentence : dp) { + res.add(SentenceUtils.listToString(sentence)); + } + + return res; + } + + public static List splitIntoSentences(String text) { + return splitIntoSentences(new StringReader(text)); + } + + public static List splitIntoSentencesFromFile(File file) throws FileNotFoundException { + return splitIntoSentences(new BufferedReader(new FileReader(file))); + } +} diff --git a/src/main/java/org/lambda3/text/simplification/discourse/utils/words/WordsUtils.java b/src/main/java/org/lambda3/text/simplification/discourse/utils/words/WordsUtils.java new file mode 100644 index 0000000..1c8f53b --- /dev/null +++ b/src/main/java/org/lambda3/text/simplification/discourse/utils/words/WordsUtils.java @@ -0,0 +1,91 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : WordsUtils + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils.words; + +import edu.stanford.nlp.ling.SentenceUtils; +import edu.stanford.nlp.ling.Word; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +/** + * + */ +public class WordsUtils { + + public static String wordsToString(List words) { + return SentenceUtils.listToString(words); + } + + public static String wordsToProperSentenceString(List words) { + return wordsToString(wordsToProperSentence(words)); + } + + private static Word capitalizeWord(Word word) { + String s = word.value(); + if (s.length() > 0) { + s = s.substring(0, 1).toUpperCase() + s.substring(1); + } + + return new Word(s); + } + + public static Word lowercaseWord(Word word) { + return new Word(word.value().toLowerCase()); + } + + private static List wordsToProperSentence(List words) { + List res = new ArrayList<>(); + res.addAll(words); + + // trim '.' and ',' at beginning and the end and remove multiple, consecutive occurrences + for (String c : Arrays.asList(".", ",")) { + Word prev = null; + Iterator it = res.iterator(); + while (it.hasNext()) { + Word word = it.next(); + if (word.value().equals(c)) { + if (prev == null || prev.value().equals(word.value())) { + it.remove(); + } + } + prev = word; + } + if ((!res.isEmpty()) && (res.get(res.size() - 1).value().equals(c))) { + res.remove(res.size() - 1); + } + } + + // add a '.' 
at the end + res.add(new Word(".")); + + // capitalize first word + if (!res.isEmpty()) { + res.set(0, capitalizeWord(res.get(0))); + } + + return res; + } +} diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml new file mode 100644 index 0000000..4e82fc8 --- /dev/null +++ b/src/main/resources/logback.xml @@ -0,0 +1,36 @@ + + + + + + + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + \ No newline at end of file diff --git a/src/test/java/org/lambda3/text/simplification/discourse/processing/ProcessorTest.java b/src/test/java/org/lambda3/text/simplification/discourse/processing/ProcessorTest.java new file mode 100644 index 0000000..912c792 --- /dev/null +++ b/src/test/java/org/lambda3/text/simplification/discourse/processing/ProcessorTest.java @@ -0,0 +1,69 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : ProcessorTest + * + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. 
+ * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.processing; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.lambda3.text.simplification.discourse.sentence_simplification.element.DCore; +import org.lambda3.text.simplification.discourse.sentence_simplification.element.SContext; +import org.lambda3.text.simplification.discourse.sentence_simplification.relation.DCoreRelation; +import org.lambda3.text.simplification.discourse.tree.Relation; + +import java.util.Arrays; +import java.util.List; + +/** + * + */ +class ProcessorTest { + + @Test + void processSingleSentence() { + + String text = "Bernhard is working on a project for PACE but he also works for MARIO."; + + DCore first = new DCore("Bernhard is working on a project for PACE .", 0, "Bernhard is working on a project for PACE ."); + DCore second = new DCore("he also works .", 0, "He also works for MARIO ."); + + first.addDCoreRelation(new DCoreRelation( + Relation.CONTRAST, second + )); + + second.addDCoreRelation(new DCoreRelation( + Relation.CONTRAST, first + )); + second.addSContext(new SContext( + "This is for MARIO .", 0, Relation.UNKNOWN_SENT_SIM + )); + + final List expected = Arrays.asList(first, second); + + + Processor p = new Processor(); + final List actual = p.process(text, Processor.ProcessingType.SEPARATE); + + Assertions.assertIterableEquals(expected, actual); + + } + +} \ No newline at end of file diff --git a/src/test/java/org/lambda3/text/simplification/discourse/utils/ExtractionUtilsTest.java b/src/test/java/org/lambda3/text/simplification/discourse/utils/ExtractionUtilsTest.java new file mode 100644 index 0000000..e6588b0 --- /dev/null +++ b/src/test/java/org/lambda3/text/simplification/discourse/utils/ExtractionUtilsTest.java @@ -0,0 +1,57 @@ +/* + * ==========================License-Start============================= + * DiscourseSimplification : ExtractionUtilsTest + 
* + * Copyright © 2017 Lambda³ + * + * GNU General Public License 3 + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + * ==========================License-End============================== + */ + +package org.lambda3.text.simplification.discourse.utils; + +import edu.stanford.nlp.ling.Word; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; + +import java.util.Arrays; +import java.util.List; + +/** + * + */ +class ExtractionUtilsTest { + + @Test + void wordsToProperSentence() throws Exception { + List words = Arrays.asList( + new Word("."), + new Word("."), + new Word("hello"), + new Word(","), + new Word(","), + new Word("this"), + new Word("is"), + new Word("a"), + new Word("test"), + new Word("."), + new Word(".") + ); + + String sentence = WordsUtils.wordsToProperSentenceString(words); + Assertions.assertEquals("Hello , this is a test .", sentence); + } +} \ No newline at end of file