From cb9c50dd8ef66288ae2f3ef36e361ad9a7b62a03 Mon Sep 17 00:00:00 2001 From: Dominic Ricottone Date: Mon, 12 Sep 2022 13:52:51 -0500 Subject: [PATCH] Initial commit --- .gitignore | 3 + LICENSE.txt | 339 ++++++++ Makefile | 23 + README.md | 38 + build/regexp.go-part1 | 3 + build/regexp.go-part2 | 1 + build/regexp.go-part3 | 2 + build/top-level-domains.txt | 1490 +++++++++++++++++++++++++++++++++++ go.mod | 3 + main.go | 119 +++ regexp.go | 4 + 11 files changed, 2025 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE.txt create mode 100644 Makefile create mode 100644 README.md create mode 100644 build/regexp.go-part1 create mode 100644 build/regexp.go-part2 create mode 100644 build/regexp.go-part3 create mode 100644 build/top-level-domains.txt create mode 100644 go.mod create mode 100644 main.go create mode 100644 regexp.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6a2fbf3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +parcels +go.sum + diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..d511905 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7cf4d55 --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ +URL_ICANN=https://data.iana.org/TLD/tlds-alpha-by-domain.txt + +build/top-level-domains.txt: + curl -o build/top-level-domains.txt $(URL_ICANN) + +regexp.go: build/top-level-domains.txt + tail +2 build/top-level-domains.txt \ + | perl -pe 'chomp if eof' \ + | tr '[:upper:]' '[:lower:]' \ + | tr '\n' '|' > build/regexp.go-part2 + cat <(perl -pe 'chomp if eof' build/regexp.go-part1) \ + <(perl -pe 'chomp if eof' build/regexp.go-part2) \ + build/regexp.go-part3 \ + > regexp.go + +.PHONY: clean +clean: + rm parcels regexp.go go.sum + +.PHONY: build +build: regexp.go + go build + diff --git a/README.md b/README.md new file mode 100644 index 0000000..d3097d6 --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# parcel + +A tool to *parse URLs*. If you say that fast enough, it sounds like *parcels*. + + +## Usage + +`cat EMAIL | parcel` + +This will... + + 1. re-prints `EMAIL` with URLs replaced by indices + 2. print a postscript of indexed URLs + +`cat EMAIL | parcel -n 0` + +This will... + + 1. try to find the first URL in `EMAIL` and re-print just that URL + 2. otherwise print nothing + + +## License + +The URL regular expression is adapted from that used in +[urlscan](https://github.com/firecat53/urlscan), which is licensed under GPL v2. +Credit for this portion of the code should go to: + + + Scott Hansen \<[firecat4153@gmail.com](mailto:firecat4153@gmail.com)\> + (Author and Maintainer) + + Maxime Chatelle \<[xakz@rxsoft.eu](mailto:xakz@rxsoft.eu)\> + (Debian Maintainer) + + Daniel Burrows \<[dburrows@debian.org](mailto:dburrows@debian.org)\> + (Original Author) + +I personally consider this project to be derivative, therefore I am keeping the +GPL v2 license of the urlscan project. + diff --git a/build/regexp.go-part1 b/build/regexp.go-part1 new file mode 100644 index 0000000..d19119e --- /dev/null +++ b/build/regexp.go-part1 @@ -0,0 +1,3 @@ +package main + +var UrlPattern = `:?((?:(?:https?|file|ftps?)://[{}()@\w/\\\-%?!&.=:;+,#~]*[{}(@\w/\-%&=+#$])|(?:[\w\-%]+(?:\.[\w\-%]+)*\.(?: diff --git a/build/regexp.go-part2 b/build/regexp.go-part2 new file mode 100644 index 0000000..b7634e9 --- /dev/null +++ b/build/regexp.go-part2 @@ -0,0 +1 @@ +aaa|aarp|abarth|abb|abbott|abbvie|abc|able|abogado|abudhabi|ac|academy|accenture|accountant|accountants|aco|actor|ad|adac|ads|adult|ae|aeg|aero|aetna|af|afl|africa|ag|agakhan|agency|ai|aig|airbus|airforce|airtel|akdn|al|alfaromeo|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|am|amazon|americanexpress|americanfamily|amex|amfam|amica|amsterdam|analytics|android|anquan|anz|ao|aol|apartments|app|apple|aq|aquarelle|ar|arab|aramco|archi|army|arpa|art|arte|as|asda|asia|associates|at|athleta|attorney|au|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aw|aws|ax|axa|az|azure|ba|baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays|barefoot|bargains|baseball|basketball|bauhaus|bayern|bb|bbc|bbt|bbva|bcg|bcn|bd|be|beats|beauty|beer|bentley|berlin|best|bestbuy|bet|bf|bg|bh|bharti|bi|bible|bid|bike|bing|bingo|bio|biz|bj|black|blackfriday|blockbuster|blog|bloomberg|blue|bm|bms|bmw|bn|bnpparibas|bo|boats|boehringer|bofa|bom|bond|boo|book|booking|bosch|bostik|boston|bot|boutique|box|br|bradesco|bridgestone|broadway|broker|brother|brussels|bs|bt|budapest|bugatti|build|builders|business|buy|buzz|bv|bw|by|bz|bzh|ca|cab|cafe|cal|call|calvinklein|cam|camera|camp|cancerresearch|canon|capetown|capital|capitalone|car|caravan|cards|care|career|careers|cars|casa|case|cash|casino|cat|catering|catholic|cba|cbn|cbre|cbs|cc|cd|center|ceo|cern|cf|cfa|cfd|cg|ch|chanel|channel|charity|chase|chat|cheap|chintai|christmas|chrome|church|ci|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|ck|cl|claims|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|cm|cn|co|coach|codes|coffee|college|cologne|com|comcast|commbank|community|company|compare|computer|comsec|condos|construction|consulting|contact|contractors|cooking|cookingchannel|cool|coop|corsica|country|coupon|coupons|courses|cpa|cr|credit|creditcard|creditunion|cricket|crown|crs|cruise|cruises|csc|cu|cuisinella|cv|cw|cx|cy|cymru|cyou|cz|dabur|dad|dance|data|date|dating|datsun|day|dclk|dds|de|deal|dealer|deals|degree|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet|digital|direct|directory|discount|discover|dish|diy|dj|dk|dm|dnp|do|docs|doctor|dog|domains|dot|download|drive|dtv|dubai|dunlop|dupont|durban|dvag|dvr|dz|earth|eat|ec|eco|edeka|edu|education|ee|eg|email|emerck|energy|engineer|engineering|enterprises|epson|equipment|er|ericsson|erni|es|esq|estate|et|etisalat|eu|eurovision|eus|events|exchange|expert|exposed|express|extraspace|fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback|ferrari|ferrero|fi|fiat|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale|fish|fishing|fit|fitness|fj|fk|flickr|flights|flir|florist|flowers|fly|fm|fo|foo|food|foodnetwork|football|ford|forex|forsale|forum|foundation|fox|fr|free|fresenius|frl|frogans|frontdoor|frontier|ftr|fujitsu|fun|fund|furniture|futbol|fyi|ga|gal|gallery|gallo|gallup|game|games|gap|garden|gay|gb|gbiz|gd|gdn|ge|gea|gent|genting|george|gf|gg|ggee|gh|gi|gift|gifts|gives|giving|gl|glass|gle|global|globo|gm|gmail|gmbh|gmo|gmx|gn|godaddy|gold|goldpoint|golf|goo|goodyear|goog|google|gop|got|gov|gp|gq|gr|grainger|graphics|gratis|green|gripe|grocery|group|gs|gt|gu|guardian|gucci|guge|guide|guitars|guru|gw|gy|hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|healthcare|help|helsinki|here|hermes|hgtv|hiphop|hisamitsu|hitachi|hiv|hk|hkt|hm|hn|hockey|holdings|holiday|homedepot|homegoods|homes|homesense|honda|horse|hospital|host|hosting|hot|hoteles|hotels|hotmail|house|how|hr|hsbc|ht|hu|hughes|hyatt|hyundai|ibm|icbc|ice|icu|id|ie|ieee|ifm|ikano|il|im|imamat|imdb|immo|immobilien|in|inc|industries|infiniti|info|ing|ink|institute|insurance|insure|int|international|intuit|investments|io|ipiranga|iq|ir|irish|is|ismaili|ist|istanbul|it|itau|itv|jaguar|java|jcb|je|jeep|jetzt|jewelry|jio|jll|jm|jmp|jnj|jo|jobs|joburg|jot|joy|jp|jpmorgan|jprs|juegos|juniper|kaufen|kddi|ke|kerryhotels|kerrylogistics|kerryproperties|kfh|kg|kh|ki|kia|kim|kinder|kindle|kitchen|kiwi|km|kn|koeln|komatsu|kosher|kp|kpmg|kpn|kr|krd|kred|kuokgroup|kw|ky|kyoto|kz|la|lacaixa|lamborghini|lamer|lancaster|lancia|land|landrover|lanxess|lasalle|lat|latino|latrobe|law|lawyer|lb|lc|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|li|lidl|life|lifeinsurance|lifestyle|lighting|like|lilly|limited|limo|lincoln|linde|link|lipsy|live|living|lixil|lk|llc|llp|loan|loans|locker|locus|loft|lol|london|lotte|lotto|love|lpl|lplfinancial|lr|ls|lt|ltd|ltda|lu|lundbeck|luxe|luxury|lv|ly|ma|macys|madrid|maif|maison|makeup|man|management|mango|map|market|marketing|markets|marriott|marshalls|maserati|mattel|mba|mc|mckinsey|md|me|med|media|meet|melbourne|meme|memorial|men|menu|merckmsd|mg|mh|miami|microsoft|mil|mini|mint|mit|mitsubishi|mk|ml|mlb|mls|mm|mma|mn|mo|mobi|mobile|moda|moe|moi|mom|monash|money|monster|mormon|mortgage|moscow|moto|motorcycles|mov|movie|mp|mq|mr|ms|msd|mt|mtn|mtr|mu|museum|music|mutual|mv|mw|mx|my|mz|na|nab|nagoya|name|natura|navy|nba|nc|ne|nec|net|netbank|netflix|network|neustar|new|news|next|nextdirect|nexus|nf|nfl|ng|ngo|nhk|ni|nico|nike|nikon|ninja|nissan|nissay|nl|no|nokia|northwesternmutual|norton|now|nowruz|nowtv|np|nr|nra|nrw|ntt|nu|nyc|nz|obi|observer|office|okinawa|olayan|olayangroup|oldnavy|ollo|om|omega|one|ong|onl|online|ooo|open|oracle|orange|org|organic|origins|osaka|otsuka|ott|ovh|pa|page|panasonic|paris|pars|partners|parts|party|passagens|pay|pccw|pe|pet|pf|pfizer|pg|ph|pharmacy|phd|philips|phone|photo|photography|photos|physio|pics|pictet|pictures|pid|pin|ping|pink|pioneer|pizza|pk|pl|place|play|playstation|plumbing|plus|pm|pn|pnc|pohl|poker|politie|porn|post|pr|pramerica|praxi|press|prime|pro|prod|productions|prof|progressive|promo|properties|property|protection|pru|prudential|ps|pt|pub|pw|pwc|py|qa|qpon|quebec|quest|racing|radio|re|read|realestate|realtor|realty|recipes|red|redstone|redumbrella|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant|review|reviews|rexroth|rich|richardli|ricoh|ril|rio|rip|ro|rocher|rocks|rodeo|rogers|room|rs|rsvp|ru|rugby|ruhr|run|rw|rwe|ryukyu|sa|saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant|sanofi|sap|sarl|sas|save|saxo|sb|sbi|sbs|sc|sca|scb|schaeffler|schmidt|scholarships|school|schule|schwarz|science|scot|sd|se|search|seat|secure|security|seek|select|sener|services|ses|seven|sew|sex|sexy|sfr|sg|sh|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shop|shopping|shouji|show|showtime|si|silk|sina|singles|site|sj|sk|ski|skin|sky|skype|sl|sling|sm|smart|smile|sn|sncf|so|soccer|social|softbank|software|sohu|solar|solutions|song|sony|soy|spa|space|sport|spot|sr|srl|ss|st|stada|staples|star|statebank|statefarm|stc|stcgroup|stockholm|storage|store|stream|studio|study|style|su|sucks|supplies|supply|support|surf|surgery|suzuki|sv|swatch|swiss|sx|sy|sydney|systems|sz|tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tc|tci|td|tdk|team|tech|technology|tel|temasek|tennis|teva|tf|tg|th|thd|theater|theatre|tiaa|tickets|tienda|tiffany|tips|tires|tirol|tj|tjmaxx|tjx|tk|tkmaxx|tl|tm|tmall|tn|to|today|tokyo|tools|top|toray|toshiba|total|tours|town|toyota|toys|tr|trade|trading|training|travel|travelchannel|travelers|travelersinsurance|trust|trv|tt|tube|tui|tunes|tushu|tv|tvs|tw|tz|ua|ubank|ubs|ug|uk|unicom|university|uno|uol|ups|us|uy|uz|va|vacations|vana|vanguard|vc|ve|vegas|ventures|verisign|versicherung|vet|vg|vi|viajes|video|vig|viking|villas|vin|vip|virgin|visa|vision|viva|vivo|vlaanderen|vn|vodka|volkswagen|volvo|vote|voting|voto|voyage|vu|vuelos|wales|walmart|walter|wang|wanggou|watch|watches|weather|weatherchannel|webcam|weber|website|wed|wedding|weibo|weir|wf|whoswho|wien|wiki|williamhill|win|windows|wine|winners|wme|wolterskluwer|woodside|work|works|world|wow|ws|wtc|wtf|xbox|xerox|xfinity|xihuan|xin|xn--11b4c3d|xn--1ck2e1b|xn--1qqw23a|xn--2scrj9c|xn--30rr7y|xn--3bst00m|xn--3ds443g|xn--3e0b707e|xn--3hcrj9c|xn--3pxu8k|xn--42c2d9a|xn--45br5cyl|xn--45brj9c|xn--45q11c|xn--4dbrk0ce|xn--4gbrim|xn--54b7fta0cc|xn--55qw42g|xn--55qx5d|xn--5su34j936bgsg|xn--5tzm5g|xn--6frz82g|xn--6qq986b3xl|xn--80adxhks|xn--80ao21a|xn--80aqecdr1a|xn--80asehdb|xn--80aswg|xn--8y0a063a|xn--90a3ac|xn--90ae|xn--90ais|xn--9dbq2a|xn--9et52u|xn--9krt00a|xn--b4w605ferd|xn--bck1b9a5dre4c|xn--c1avg|xn--c2br7g|xn--cck2b3b|xn--cckwcxetd|xn--cg4bki|xn--clchc0ea0b2g2a9gcd|xn--czr694b|xn--czrs0t|xn--czru2d|xn--d1acj3b|xn--d1alf|xn--e1a4c|xn--eckvdtc9d|xn--efvy88h|xn--fct429k|xn--fhbei|xn--fiq228c5hs|xn--fiq64b|xn--fiqs8s|xn--fiqz9s|xn--fjq720a|xn--flw351e|xn--fpcrj9c3d|xn--fzc2c9e2c|xn--fzys8d69uvgm|xn--g2xx48c|xn--gckr3f0f|xn--gecrj9c|xn--gk3at1e|xn--h2breg3eve|xn--h2brj9c|xn--h2brj9c8c|xn--hxt814e|xn--i1b6b1a6a2e|xn--imr513n|xn--io0a7i|xn--j1aef|xn--j1amh|xn--j6w193g|xn--jlq480n2rg|xn--jlq61u9w7b|xn--jvr189m|xn--kcrx77d1x4a|xn--kprw13d|xn--kpry57d|xn--kput3i|xn--l1acc|xn--lgbbat1ad8j|xn--mgb9awbf|xn--mgba3a3ejt|xn--mgba3a4f16a|xn--mgba7c0bbn0a|xn--mgbaakc7dvf|xn--mgbaam7a8h|xn--mgbab2bd|xn--mgbah1a3hjkrd|xn--mgbai9azgqp6j|xn--mgbayh7gpa|xn--mgbbh1a|xn--mgbbh1a71e|xn--mgbc0a9azcg|xn--mgbca7dzdo|xn--mgbcpq6gpa1a|xn--mgberp4a5d4ar|xn--mgbgu82a|xn--mgbi4ecexp|xn--mgbpl2fh|xn--mgbt3dhd|xn--mgbtx2b|xn--mgbx4cd0ab|xn--mix891f|xn--mk1bu44c|xn--mxtq1m|xn--ngbc5azd|xn--ngbe9e0a|xn--ngbrx|xn--node|xn--nqv7f|xn--nqv7fs00ema|xn--nyqy26a|xn--o3cw4h|xn--ogbpf8fl|xn--otu796d|xn--p1acf|xn--p1ai|xn--pgbs0dh|xn--pssy2u|xn--q7ce6a|xn--q9jyb4c|xn--qcka1pmc|xn--qxa6a|xn--qxam|xn--rhqv96g|xn--rovu88b|xn--rvc1e0am3e|xn--s9brj9c|xn--ses554g|xn--t60b56a|xn--tckwe|xn--tiq49xqyj|xn--unup4y|xn--vermgensberater-ctb|xn--vermgensberatung-pwb|xn--vhquv|xn--vuq861b|xn--w4r85el8fhu5dnra|xn--w4rs40l|xn--wgbh1c|xn--wgbl6a|xn--xhq521b|xn--xkc2al3hye2a|xn--xkc2dl3a5ee0h|xn--y9a3aq|xn--yfro4i67o|xn--ygbi2ammx|xn--zfr164b|xxx|xyz|yachts|yahoo|yamaxun|yandex|ye|yodobashi|yoga|yokohama|you|youtube|yt|yun|za|zappos|zara|zero|zip|zm|zone|zuerich|zw \ No newline at end of file diff --git a/build/regexp.go-part3 b/build/regexp.go-part3 new file mode 100644 index 0000000..6179067 --- /dev/null +++ b/build/regexp.go-part3 @@ -0,0 +1,2 @@ +)\b))` + diff --git a/build/top-level-domains.txt b/build/top-level-domains.txt new file mode 100644 index 0000000..e68b031 --- /dev/null +++ b/build/top-level-domains.txt @@ -0,0 +1,1490 @@ +# Version 2021121700, Last Updated Fri Dec 17 07:07:01 2021 UTC +AAA +AARP +ABARTH +ABB +ABBOTT +ABBVIE +ABC +ABLE +ABOGADO +ABUDHABI +AC +ACADEMY +ACCENTURE +ACCOUNTANT +ACCOUNTANTS +ACO +ACTOR +AD +ADAC +ADS +ADULT +AE +AEG +AERO +AETNA +AF +AFL +AFRICA +AG +AGAKHAN +AGENCY +AI +AIG +AIRBUS +AIRFORCE +AIRTEL +AKDN +AL +ALFAROMEO +ALIBABA +ALIPAY +ALLFINANZ +ALLSTATE +ALLY +ALSACE +ALSTOM +AM +AMAZON +AMERICANEXPRESS +AMERICANFAMILY +AMEX +AMFAM +AMICA +AMSTERDAM +ANALYTICS +ANDROID +ANQUAN +ANZ +AO +AOL +APARTMENTS +APP +APPLE +AQ +AQUARELLE +AR +ARAB +ARAMCO +ARCHI +ARMY +ARPA +ART +ARTE +AS +ASDA +ASIA +ASSOCIATES +AT +ATHLETA +ATTORNEY +AU +AUCTION +AUDI +AUDIBLE +AUDIO +AUSPOST +AUTHOR +AUTO +AUTOS +AVIANCA +AW +AWS +AX +AXA +AZ +AZURE +BA +BABY +BAIDU +BANAMEX +BANANAREPUBLIC +BAND +BANK +BAR +BARCELONA +BARCLAYCARD +BARCLAYS +BAREFOOT +BARGAINS +BASEBALL +BASKETBALL +BAUHAUS +BAYERN +BB +BBC +BBT +BBVA +BCG +BCN +BD +BE +BEATS +BEAUTY +BEER +BENTLEY +BERLIN +BEST +BESTBUY +BET +BF +BG +BH +BHARTI +BI +BIBLE +BID +BIKE +BING +BINGO +BIO +BIZ +BJ +BLACK +BLACKFRIDAY +BLOCKBUSTER +BLOG +BLOOMBERG +BLUE +BM +BMS +BMW +BN +BNPPARIBAS +BO +BOATS +BOEHRINGER +BOFA +BOM +BOND +BOO +BOOK +BOOKING +BOSCH +BOSTIK +BOSTON +BOT +BOUTIQUE +BOX +BR +BRADESCO +BRIDGESTONE +BROADWAY +BROKER +BROTHER +BRUSSELS +BS +BT +BUDAPEST +BUGATTI +BUILD +BUILDERS +BUSINESS +BUY +BUZZ +BV +BW +BY +BZ +BZH +CA +CAB +CAFE +CAL +CALL +CALVINKLEIN +CAM +CAMERA +CAMP +CANCERRESEARCH +CANON +CAPETOWN +CAPITAL +CAPITALONE +CAR +CARAVAN +CARDS +CARE +CAREER +CAREERS +CARS +CASA +CASE +CASH +CASINO +CAT +CATERING +CATHOLIC +CBA +CBN +CBRE +CBS +CC +CD +CENTER +CEO +CERN +CF +CFA +CFD +CG +CH +CHANEL +CHANNEL +CHARITY +CHASE +CHAT +CHEAP +CHINTAI +CHRISTMAS +CHROME +CHURCH +CI +CIPRIANI +CIRCLE +CISCO +CITADEL +CITI +CITIC +CITY +CITYEATS +CK +CL +CLAIMS +CLEANING +CLICK +CLINIC +CLINIQUE +CLOTHING +CLOUD +CLUB +CLUBMED +CM +CN +CO +COACH +CODES +COFFEE +COLLEGE +COLOGNE +COM +COMCAST +COMMBANK +COMMUNITY +COMPANY +COMPARE +COMPUTER +COMSEC +CONDOS +CONSTRUCTION +CONSULTING +CONTACT +CONTRACTORS +COOKING +COOKINGCHANNEL +COOL +COOP +CORSICA +COUNTRY +COUPON +COUPONS +COURSES +CPA +CR +CREDIT +CREDITCARD +CREDITUNION +CRICKET +CROWN +CRS +CRUISE +CRUISES +CSC +CU +CUISINELLA +CV +CW +CX +CY +CYMRU +CYOU +CZ +DABUR +DAD +DANCE +DATA +DATE +DATING +DATSUN +DAY +DCLK +DDS +DE +DEAL +DEALER +DEALS +DEGREE +DELIVERY +DELL +DELOITTE +DELTA +DEMOCRAT +DENTAL +DENTIST +DESI +DESIGN +DEV +DHL +DIAMONDS +DIET +DIGITAL +DIRECT +DIRECTORY +DISCOUNT +DISCOVER +DISH +DIY +DJ +DK +DM +DNP +DO +DOCS +DOCTOR +DOG +DOMAINS +DOT +DOWNLOAD +DRIVE +DTV +DUBAI +DUNLOP +DUPONT +DURBAN +DVAG +DVR +DZ +EARTH +EAT +EC +ECO +EDEKA +EDU +EDUCATION +EE +EG +EMAIL +EMERCK +ENERGY +ENGINEER +ENGINEERING +ENTERPRISES +EPSON +EQUIPMENT +ER +ERICSSON +ERNI +ES +ESQ +ESTATE +ET +ETISALAT +EU +EUROVISION +EUS +EVENTS +EXCHANGE +EXPERT +EXPOSED +EXPRESS +EXTRASPACE +FAGE +FAIL +FAIRWINDS +FAITH +FAMILY +FAN +FANS +FARM +FARMERS +FASHION +FAST +FEDEX +FEEDBACK +FERRARI +FERRERO +FI +FIAT +FIDELITY +FIDO +FILM +FINAL +FINANCE +FINANCIAL +FIRE +FIRESTONE +FIRMDALE +FISH +FISHING +FIT +FITNESS +FJ +FK +FLICKR +FLIGHTS +FLIR +FLORIST +FLOWERS +FLY +FM +FO +FOO +FOOD +FOODNETWORK +FOOTBALL +FORD +FOREX +FORSALE +FORUM +FOUNDATION +FOX +FR +FREE +FRESENIUS +FRL +FROGANS +FRONTDOOR +FRONTIER +FTR +FUJITSU +FUN +FUND +FURNITURE +FUTBOL +FYI +GA +GAL +GALLERY +GALLO +GALLUP +GAME +GAMES +GAP +GARDEN +GAY +GB +GBIZ +GD +GDN +GE +GEA +GENT +GENTING +GEORGE +GF +GG +GGEE +GH +GI +GIFT +GIFTS +GIVES +GIVING +GL +GLASS +GLE +GLOBAL +GLOBO +GM +GMAIL +GMBH +GMO +GMX +GN +GODADDY +GOLD +GOLDPOINT +GOLF +GOO +GOODYEAR +GOOG +GOOGLE +GOP +GOT +GOV +GP +GQ +GR +GRAINGER +GRAPHICS +GRATIS +GREEN +GRIPE +GROCERY +GROUP +GS +GT +GU +GUARDIAN +GUCCI +GUGE +GUIDE +GUITARS +GURU +GW +GY +HAIR +HAMBURG +HANGOUT +HAUS +HBO +HDFC +HDFCBANK +HEALTH +HEALTHCARE +HELP +HELSINKI +HERE +HERMES +HGTV +HIPHOP +HISAMITSU +HITACHI +HIV +HK +HKT +HM +HN +HOCKEY +HOLDINGS +HOLIDAY +HOMEDEPOT +HOMEGOODS +HOMES +HOMESENSE +HONDA +HORSE +HOSPITAL +HOST +HOSTING +HOT +HOTELES +HOTELS +HOTMAIL +HOUSE +HOW +HR +HSBC +HT +HU +HUGHES +HYATT +HYUNDAI +IBM +ICBC +ICE +ICU +ID +IE +IEEE +IFM +IKANO +IL +IM +IMAMAT +IMDB +IMMO +IMMOBILIEN +IN +INC +INDUSTRIES +INFINITI +INFO +ING +INK +INSTITUTE +INSURANCE +INSURE +INT +INTERNATIONAL +INTUIT +INVESTMENTS +IO +IPIRANGA +IQ +IR +IRISH +IS +ISMAILI +IST +ISTANBUL +IT +ITAU +ITV +JAGUAR +JAVA +JCB +JE +JEEP +JETZT +JEWELRY +JIO +JLL +JM +JMP +JNJ +JO +JOBS +JOBURG +JOT +JOY +JP +JPMORGAN +JPRS +JUEGOS +JUNIPER +KAUFEN +KDDI +KE +KERRYHOTELS +KERRYLOGISTICS +KERRYPROPERTIES +KFH +KG +KH +KI +KIA +KIM +KINDER +KINDLE +KITCHEN +KIWI +KM +KN +KOELN +KOMATSU +KOSHER +KP +KPMG +KPN +KR +KRD +KRED +KUOKGROUP +KW +KY +KYOTO +KZ +LA +LACAIXA +LAMBORGHINI +LAMER +LANCASTER +LANCIA +LAND +LANDROVER +LANXESS +LASALLE +LAT +LATINO +LATROBE +LAW +LAWYER +LB +LC +LDS +LEASE +LECLERC +LEFRAK +LEGAL +LEGO +LEXUS +LGBT +LI +LIDL +LIFE +LIFEINSURANCE +LIFESTYLE +LIGHTING +LIKE +LILLY +LIMITED +LIMO +LINCOLN +LINDE +LINK +LIPSY +LIVE +LIVING +LIXIL +LK +LLC +LLP +LOAN +LOANS +LOCKER +LOCUS +LOFT +LOL +LONDON +LOTTE +LOTTO +LOVE +LPL +LPLFINANCIAL +LR +LS +LT +LTD +LTDA +LU +LUNDBECK +LUXE +LUXURY +LV +LY +MA +MACYS +MADRID +MAIF +MAISON +MAKEUP +MAN +MANAGEMENT +MANGO +MAP +MARKET +MARKETING +MARKETS +MARRIOTT +MARSHALLS +MASERATI +MATTEL +MBA +MC +MCKINSEY +MD +ME +MED +MEDIA +MEET +MELBOURNE +MEME +MEMORIAL +MEN +MENU +MERCKMSD +MG +MH +MIAMI +MICROSOFT +MIL +MINI +MINT +MIT +MITSUBISHI +MK +ML +MLB +MLS +MM +MMA +MN +MO +MOBI +MOBILE +MODA +MOE +MOI +MOM +MONASH +MONEY +MONSTER +MORMON +MORTGAGE +MOSCOW +MOTO +MOTORCYCLES +MOV +MOVIE +MP +MQ +MR +MS +MSD +MT +MTN +MTR +MU +MUSEUM +MUSIC +MUTUAL +MV +MW +MX +MY +MZ +NA +NAB +NAGOYA +NAME +NATURA +NAVY +NBA +NC +NE +NEC +NET +NETBANK +NETFLIX +NETWORK +NEUSTAR +NEW +NEWS +NEXT +NEXTDIRECT +NEXUS +NF +NFL +NG +NGO +NHK +NI +NICO +NIKE +NIKON +NINJA +NISSAN +NISSAY +NL +NO +NOKIA +NORTHWESTERNMUTUAL +NORTON +NOW +NOWRUZ +NOWTV +NP +NR +NRA +NRW +NTT +NU +NYC +NZ +OBI +OBSERVER +OFFICE +OKINAWA +OLAYAN +OLAYANGROUP +OLDNAVY +OLLO +OM +OMEGA +ONE +ONG +ONL +ONLINE +OOO +OPEN +ORACLE +ORANGE +ORG +ORGANIC +ORIGINS +OSAKA +OTSUKA +OTT +OVH +PA +PAGE +PANASONIC +PARIS +PARS +PARTNERS +PARTS +PARTY +PASSAGENS +PAY +PCCW +PE +PET +PF +PFIZER +PG +PH +PHARMACY +PHD +PHILIPS +PHONE +PHOTO +PHOTOGRAPHY +PHOTOS +PHYSIO +PICS +PICTET +PICTURES +PID +PIN +PING +PINK +PIONEER +PIZZA +PK +PL +PLACE +PLAY +PLAYSTATION +PLUMBING +PLUS +PM +PN +PNC +POHL +POKER +POLITIE +PORN +POST +PR +PRAMERICA +PRAXI +PRESS +PRIME +PRO +PROD +PRODUCTIONS +PROF +PROGRESSIVE +PROMO +PROPERTIES +PROPERTY +PROTECTION +PRU +PRUDENTIAL +PS +PT +PUB +PW +PWC +PY +QA +QPON +QUEBEC +QUEST +RACING +RADIO +RE +READ +REALESTATE +REALTOR +REALTY +RECIPES +RED +REDSTONE +REDUMBRELLA +REHAB +REISE +REISEN +REIT +RELIANCE +REN +RENT +RENTALS +REPAIR +REPORT +REPUBLICAN +REST +RESTAURANT +REVIEW +REVIEWS +REXROTH +RICH +RICHARDLI +RICOH +RIL +RIO +RIP +RO +ROCHER +ROCKS +RODEO +ROGERS +ROOM +RS +RSVP +RU +RUGBY +RUHR +RUN +RW +RWE +RYUKYU +SA +SAARLAND +SAFE +SAFETY +SAKURA +SALE +SALON +SAMSCLUB +SAMSUNG +SANDVIK +SANDVIKCOROMANT +SANOFI +SAP +SARL +SAS +SAVE +SAXO +SB +SBI +SBS +SC +SCA +SCB +SCHAEFFLER +SCHMIDT +SCHOLARSHIPS +SCHOOL +SCHULE +SCHWARZ +SCIENCE +SCOT +SD +SE +SEARCH +SEAT +SECURE +SECURITY +SEEK +SELECT +SENER +SERVICES +SES +SEVEN +SEW +SEX +SEXY +SFR +SG +SH +SHANGRILA +SHARP +SHAW +SHELL +SHIA +SHIKSHA +SHOES +SHOP +SHOPPING +SHOUJI +SHOW +SHOWTIME +SI +SILK +SINA +SINGLES +SITE +SJ +SK +SKI +SKIN +SKY +SKYPE +SL +SLING +SM +SMART +SMILE +SN +SNCF +SO +SOCCER +SOCIAL +SOFTBANK +SOFTWARE +SOHU +SOLAR +SOLUTIONS +SONG +SONY +SOY +SPA +SPACE +SPORT +SPOT +SR +SRL +SS +ST +STADA +STAPLES +STAR +STATEBANK +STATEFARM +STC +STCGROUP +STOCKHOLM +STORAGE +STORE +STREAM +STUDIO +STUDY +STYLE +SU +SUCKS +SUPPLIES +SUPPLY +SUPPORT +SURF +SURGERY +SUZUKI +SV +SWATCH +SWISS +SX +SY +SYDNEY +SYSTEMS +SZ +TAB +TAIPEI +TALK +TAOBAO +TARGET +TATAMOTORS +TATAR +TATTOO +TAX +TAXI +TC +TCI +TD +TDK +TEAM +TECH +TECHNOLOGY +TEL +TEMASEK +TENNIS +TEVA +TF +TG +TH +THD +THEATER +THEATRE +TIAA +TICKETS +TIENDA +TIFFANY +TIPS +TIRES +TIROL +TJ +TJMAXX +TJX +TK +TKMAXX +TL +TM +TMALL +TN +TO +TODAY +TOKYO +TOOLS +TOP +TORAY +TOSHIBA +TOTAL +TOURS +TOWN +TOYOTA +TOYS +TR +TRADE +TRADING +TRAINING +TRAVEL +TRAVELCHANNEL +TRAVELERS +TRAVELERSINSURANCE +TRUST +TRV +TT +TUBE +TUI +TUNES +TUSHU +TV +TVS +TW +TZ +UA +UBANK +UBS +UG +UK +UNICOM +UNIVERSITY +UNO +UOL +UPS +US +UY +UZ +VA +VACATIONS +VANA +VANGUARD +VC +VE +VEGAS +VENTURES +VERISIGN +VERSICHERUNG +VET +VG +VI +VIAJES +VIDEO +VIG +VIKING +VILLAS +VIN +VIP +VIRGIN +VISA +VISION +VIVA +VIVO +VLAANDEREN +VN +VODKA +VOLKSWAGEN +VOLVO +VOTE +VOTING +VOTO +VOYAGE +VU +VUELOS +WALES +WALMART +WALTER +WANG +WANGGOU +WATCH +WATCHES +WEATHER +WEATHERCHANNEL +WEBCAM +WEBER +WEBSITE +WED +WEDDING +WEIBO +WEIR +WF +WHOSWHO +WIEN +WIKI +WILLIAMHILL +WIN +WINDOWS +WINE +WINNERS +WME +WOLTERSKLUWER +WOODSIDE +WORK +WORKS +WORLD +WOW +WS +WTC +WTF +XBOX +XEROX +XFINITY +XIHUAN +XIN +XN--11B4C3D +XN--1CK2E1B +XN--1QQW23A +XN--2SCRJ9C +XN--30RR7Y +XN--3BST00M +XN--3DS443G +XN--3E0B707E +XN--3HCRJ9C +XN--3PXU8K +XN--42C2D9A +XN--45BR5CYL +XN--45BRJ9C +XN--45Q11C +XN--4DBRK0CE +XN--4GBRIM +XN--54B7FTA0CC +XN--55QW42G +XN--55QX5D +XN--5SU34J936BGSG +XN--5TZM5G +XN--6FRZ82G +XN--6QQ986B3XL +XN--80ADXHKS +XN--80AO21A +XN--80AQECDR1A +XN--80ASEHDB +XN--80ASWG +XN--8Y0A063A +XN--90A3AC +XN--90AE +XN--90AIS +XN--9DBQ2A +XN--9ET52U +XN--9KRT00A +XN--B4W605FERD +XN--BCK1B9A5DRE4C +XN--C1AVG +XN--C2BR7G +XN--CCK2B3B +XN--CCKWCXETD +XN--CG4BKI +XN--CLCHC0EA0B2G2A9GCD +XN--CZR694B +XN--CZRS0T +XN--CZRU2D +XN--D1ACJ3B +XN--D1ALF +XN--E1A4C +XN--ECKVDTC9D +XN--EFVY88H +XN--FCT429K +XN--FHBEI +XN--FIQ228C5HS +XN--FIQ64B +XN--FIQS8S +XN--FIQZ9S +XN--FJQ720A +XN--FLW351E +XN--FPCRJ9C3D +XN--FZC2C9E2C +XN--FZYS8D69UVGM +XN--G2XX48C +XN--GCKR3F0F +XN--GECRJ9C +XN--GK3AT1E +XN--H2BREG3EVE +XN--H2BRJ9C +XN--H2BRJ9C8C +XN--HXT814E +XN--I1B6B1A6A2E +XN--IMR513N +XN--IO0A7I +XN--J1AEF +XN--J1AMH +XN--J6W193G +XN--JLQ480N2RG +XN--JLQ61U9W7B +XN--JVR189M +XN--KCRX77D1X4A +XN--KPRW13D +XN--KPRY57D +XN--KPUT3I +XN--L1ACC +XN--LGBBAT1AD8J +XN--MGB9AWBF +XN--MGBA3A3EJT +XN--MGBA3A4F16A +XN--MGBA7C0BBN0A +XN--MGBAAKC7DVF +XN--MGBAAM7A8H +XN--MGBAB2BD +XN--MGBAH1A3HJKRD +XN--MGBAI9AZGQP6J +XN--MGBAYH7GPA +XN--MGBBH1A +XN--MGBBH1A71E +XN--MGBC0A9AZCG +XN--MGBCA7DZDO +XN--MGBCPQ6GPA1A +XN--MGBERP4A5D4AR +XN--MGBGU82A +XN--MGBI4ECEXP +XN--MGBPL2FH +XN--MGBT3DHD +XN--MGBTX2B +XN--MGBX4CD0AB +XN--MIX891F +XN--MK1BU44C +XN--MXTQ1M +XN--NGBC5AZD +XN--NGBE9E0A +XN--NGBRX +XN--NODE +XN--NQV7F +XN--NQV7FS00EMA +XN--NYQY26A +XN--O3CW4H +XN--OGBPF8FL +XN--OTU796D +XN--P1ACF +XN--P1AI +XN--PGBS0DH +XN--PSSY2U +XN--Q7CE6A +XN--Q9JYB4C +XN--QCKA1PMC +XN--QXA6A +XN--QXAM +XN--RHQV96G +XN--ROVU88B +XN--RVC1E0AM3E +XN--S9BRJ9C +XN--SES554G +XN--T60B56A +XN--TCKWE +XN--TIQ49XQYJ +XN--UNUP4Y +XN--VERMGENSBERATER-CTB +XN--VERMGENSBERATUNG-PWB +XN--VHQUV +XN--VUQ861B +XN--W4R85EL8FHU5DNRA +XN--W4RS40L +XN--WGBH1C +XN--WGBL6A +XN--XHQ521B +XN--XKC2AL3HYE2A +XN--XKC2DL3A5EE0H +XN--Y9A3AQ +XN--YFRO4I67O +XN--YGBI2AMMX +XN--ZFR164B +XXX +XYZ +YACHTS +YAHOO +YAMAXUN +YANDEX +YE +YODOBASHI +YOGA +YOKOHAMA +YOU +YOUTUBE +YT +YUN +ZA +ZAPPOS +ZARA +ZERO +ZIP +ZM +ZONE +ZUERICH +ZW diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..f1e32d7 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module git.dominic-ricottone.com/parcels + +go 1.17 diff --git a/main.go b/main.go new file mode 100644 index 0000000..be03f9c --- /dev/null +++ b/main.go @@ -0,0 +1,119 @@ +package main + +import ( + "fmt" + "os" + "io" + "bufio" + "regexp" + "flag" +) + +func Replacement(i int) string { + return fmt.Sprintf("[%d]", i) +} + +func find_in_stream(reader io.Reader, target int) { + // Create scanner from reader + input := bufio.NewScanner(reader) + + // Initialize state + re := regexp.MustCompile(UrlPattern) + count := 0 + + // Parse and print + for input.Scan() { + line := input.Text() + line_indices := re.FindAllStringIndex(line, -1) + count_after := count + len(line_indices) + + if target < count_after { + beg, end := line_indices[target-count][0], line_indices[target-count][1] + fmt.Println(line[beg:end]) + break + } + + count = count_after + } + + // Check for scanner errors + if err := input.Err(); err != nil { + fmt.Printf("internal error - %v\n", err) + os.Exit(1) + } +} + +func parse_stream(reader io.Reader) { + // Create scanner from reader + input := bufio.NewScanner(reader) + + // Initialize state + re := regexp.MustCompile(UrlPattern) + var parcels [999]string //assuming that never need 1000+ URLs + offset := 0 + + // Parse, modify, and print + for input.Scan() { + line := input.Text() + + line_indices := re.FindAllStringIndex(line, -1) + for i := len(line_indices)-1; i >= 0; i-- { + beg, end := line_indices[i][0], line_indices[i][1] + parcels[offset+i] = line[beg:end] + line = line[:beg] + Replacement(offset+i) + line[end:] + } + + fmt.Println(line) + + offset = offset + len(line_indices) + } + + // Print postscript + fmt.Printf("\nURLs:\n") + for index, url := range parcels { + if url == "" { + break + } + fmt.Printf("[%d] %s\n", index, url) + } + + // Check for scanner errors + if err := input.Err(); err != nil { + fmt.Printf("internal error - %v\n", err) + os.Exit(1) + } +} + +func parse_file(filename string) { + // Check file + file, err := os.Open(filename) + if err != nil { + fmt.Printf("cannot read file '%s'\n", filename) + os.Exit(1) + } + defer file.Close() + + // Parse + parse_stream(file) +} + +func main() { + // Check STDIN + _, err := os.Stdin.Stat() + if err != nil { + fmt.Println("cannot read input") + os.Exit(1) + } + + // Look for arguments + var index = flag.Int("n", -1, "print Nth URL") + flag.Parse() + + // Parse + if *index < 0 { + parse_stream(os.Stdin) + } else { + find_in_stream(os.Stdin, *index) + } +} + diff --git a/regexp.go b/regexp.go new file mode 100644 index 0000000..a74a298 --- /dev/null +++ b/regexp.go @@ -0,0 +1,4 @@ +package main + +var UrlPattern = `:?((?:(?:https?|file|ftps?)://[{}()@\w/\\\-%?!&.=:;+,#~]*[{}(@\w/\-%&=+#$])|(?:[\w\-%]+(?:\.[\w\-%]+)*\.(?:aaa|aarp|abarth|abb|abbott|abbvie|abc|able|abogado|abudhabi|ac|academy|accenture|accountant|accountants|aco|actor|ad|adac|ads|adult|ae|aeg|aero|aetna|af|afl|africa|ag|agakhan|agency|ai|aig|airbus|airforce|airtel|akdn|al|alfaromeo|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|am|amazon|americanexpress|americanfamily|amex|amfam|amica|amsterdam|analytics|android|anquan|anz|ao|aol|apartments|app|apple|aq|aquarelle|ar|arab|aramco|archi|army|arpa|art|arte|as|asda|asia|associates|at|athleta|attorney|au|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aw|aws|ax|axa|az|azure|ba|baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays|barefoot|bargains|baseball|basketball|bauhaus|bayern|bb|bbc|bbt|bbva|bcg|bcn|bd|be|beats|beauty|beer|bentley|berlin|best|bestbuy|bet|bf|bg|bh|bharti|bi|bible|bid|bike|bing|bingo|bio|biz|bj|black|blackfriday|blockbuster|blog|bloomberg|blue|bm|bms|bmw|bn|bnpparibas|bo|boats|boehringer|bofa|bom|bond|boo|book|booking|bosch|bostik|boston|bot|boutique|box|br|bradesco|bridgestone|broadway|broker|brother|brussels|bs|bt|budapest|bugatti|build|builders|business|buy|buzz|bv|bw|by|bz|bzh|ca|cab|cafe|cal|call|calvinklein|cam|camera|camp|cancerresearch|canon|capetown|capital|capitalone|car|caravan|cards|care|career|careers|cars|casa|case|cash|casino|cat|catering|catholic|cba|cbn|cbre|cbs|cc|cd|center|ceo|cern|cf|cfa|cfd|cg|ch|chanel|channel|charity|chase|chat|cheap|chintai|christmas|chrome|church|ci|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|ck|cl|claims|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|cm|cn|co|coach|codes|coffee|college|cologne|com|comcast|commbank|community|company|compare|computer|comsec|condos|construction|consulting|contact|contractors|cooking|cookingchannel|cool|coop|corsica|country|coupon|coupons|courses|cpa|cr|credit|creditcard|creditunion|cricket|crown|crs|cruise|cruises|csc|cu|cuisinella|cv|cw|cx|cy|cymru|cyou|cz|dabur|dad|dance|data|date|dating|datsun|day|dclk|dds|de|deal|dealer|deals|degree|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet|digital|direct|directory|discount|discover|dish|diy|dj|dk|dm|dnp|do|docs|doctor|dog|domains|dot|download|drive|dtv|dubai|dunlop|dupont|durban|dvag|dvr|dz|earth|eat|ec|eco|edeka|edu|education|ee|eg|email|emerck|energy|engineer|engineering|enterprises|epson|equipment|er|ericsson|erni|es|esq|estate|et|etisalat|eu|eurovision|eus|events|exchange|expert|exposed|express|extraspace|fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback|ferrari|ferrero|fi|fiat|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale|fish|fishing|fit|fitness|fj|fk|flickr|flights|flir|florist|flowers|fly|fm|fo|foo|food|foodnetwork|football|ford|forex|forsale|forum|foundation|fox|fr|free|fresenius|frl|frogans|frontdoor|frontier|ftr|fujitsu|fun|fund|furniture|futbol|fyi|ga|gal|gallery|gallo|gallup|game|games|gap|garden|gay|gb|gbiz|gd|gdn|ge|gea|gent|genting|george|gf|gg|ggee|gh|gi|gift|gifts|gives|giving|gl|glass|gle|global|globo|gm|gmail|gmbh|gmo|gmx|gn|godaddy|gold|goldpoint|golf|goo|goodyear|goog|google|gop|got|gov|gp|gq|gr|grainger|graphics|gratis|green|gripe|grocery|group|gs|gt|gu|guardian|gucci|guge|guide|guitars|guru|gw|gy|hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|healthcare|help|helsinki|here|hermes|hgtv|hiphop|hisamitsu|hitachi|hiv|hk|hkt|hm|hn|hockey|holdings|holiday|homedepot|homegoods|homes|homesense|honda|horse|hospital|host|hosting|hot|hoteles|hotels|hotmail|house|how|hr|hsbc|ht|hu|hughes|hyatt|hyundai|ibm|icbc|ice|icu|id|ie|ieee|ifm|ikano|il|im|imamat|imdb|immo|immobilien|in|inc|industries|infiniti|info|ing|ink|institute|insurance|insure|int|international|intuit|investments|io|ipiranga|iq|ir|irish|is|ismaili|ist|istanbul|it|itau|itv|jaguar|java|jcb|je|jeep|jetzt|jewelry|jio|jll|jm|jmp|jnj|jo|jobs|joburg|jot|joy|jp|jpmorgan|jprs|juegos|juniper|kaufen|kddi|ke|kerryhotels|kerrylogistics|kerryproperties|kfh|kg|kh|ki|kia|kim|kinder|kindle|kitchen|kiwi|km|kn|koeln|komatsu|kosher|kp|kpmg|kpn|kr|krd|kred|kuokgroup|kw|ky|kyoto|kz|la|lacaixa|lamborghini|lamer|lancaster|lancia|land|landrover|lanxess|lasalle|lat|latino|latrobe|law|lawyer|lb|lc|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|li|lidl|life|lifeinsurance|lifestyle|lighting|like|lilly|limited|limo|lincoln|linde|link|lipsy|live|living|lixil|lk|llc|llp|loan|loans|locker|locus|loft|lol|london|lotte|lotto|love|lpl|lplfinancial|lr|ls|lt|ltd|ltda|lu|lundbeck|luxe|luxury|lv|ly|ma|macys|madrid|maif|maison|makeup|man|management|mango|map|market|marketing|markets|marriott|marshalls|maserati|mattel|mba|mc|mckinsey|md|me|med|media|meet|melbourne|meme|memorial|men|menu|merckmsd|mg|mh|miami|microsoft|mil|mini|mint|mit|mitsubishi|mk|ml|mlb|mls|mm|mma|mn|mo|mobi|mobile|moda|moe|moi|mom|monash|money|monster|mormon|mortgage|moscow|moto|motorcycles|mov|movie|mp|mq|mr|ms|msd|mt|mtn|mtr|mu|museum|music|mutual|mv|mw|mx|my|mz|na|nab|nagoya|name|natura|navy|nba|nc|ne|nec|net|netbank|netflix|network|neustar|new|news|next|nextdirect|nexus|nf|nfl|ng|ngo|nhk|ni|nico|nike|nikon|ninja|nissan|nissay|nl|no|nokia|northwesternmutual|norton|now|nowruz|nowtv|np|nr|nra|nrw|ntt|nu|nyc|nz|obi|observer|office|okinawa|olayan|olayangroup|oldnavy|ollo|om|omega|one|ong|onl|online|ooo|open|oracle|orange|org|organic|origins|osaka|otsuka|ott|ovh|pa|page|panasonic|paris|pars|partners|parts|party|passagens|pay|pccw|pe|pet|pf|pfizer|pg|ph|pharmacy|phd|philips|phone|photo|photography|photos|physio|pics|pictet|pictures|pid|pin|ping|pink|pioneer|pizza|pk|pl|place|play|playstation|plumbing|plus|pm|pn|pnc|pohl|poker|politie|porn|post|pr|pramerica|praxi|press|prime|pro|prod|productions|prof|progressive|promo|properties|property|protection|pru|prudential|ps|pt|pub|pw|pwc|py|qa|qpon|quebec|quest|racing|radio|re|read|realestate|realtor|realty|recipes|red|redstone|redumbrella|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant|review|reviews|rexroth|rich|richardli|ricoh|ril|rio|rip|ro|rocher|rocks|rodeo|rogers|room|rs|rsvp|ru|rugby|ruhr|run|rw|rwe|ryukyu|sa|saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant|sanofi|sap|sarl|sas|save|saxo|sb|sbi|sbs|sc|sca|scb|schaeffler|schmidt|scholarships|school|schule|schwarz|science|scot|sd|se|search|seat|secure|security|seek|select|sener|services|ses|seven|sew|sex|sexy|sfr|sg|sh|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shop|shopping|shouji|show|showtime|si|silk|sina|singles|site|sj|sk|ski|skin|sky|skype|sl|sling|sm|smart|smile|sn|sncf|so|soccer|social|softbank|software|sohu|solar|solutions|song|sony|soy|spa|space|sport|spot|sr|srl|ss|st|stada|staples|star|statebank|statefarm|stc|stcgroup|stockholm|storage|store|stream|studio|study|style|su|sucks|supplies|supply|support|surf|surgery|suzuki|sv|swatch|swiss|sx|sy|sydney|systems|sz|tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tc|tci|td|tdk|team|tech|technology|tel|temasek|tennis|teva|tf|tg|th|thd|theater|theatre|tiaa|tickets|tienda|tiffany|tips|tires|tirol|tj|tjmaxx|tjx|tk|tkmaxx|tl|tm|tmall|tn|to|today|tokyo|tools|top|toray|toshiba|total|tours|town|toyota|toys|tr|trade|trading|training|travel|travelchannel|travelers|travelersinsurance|trust|trv|tt|tube|tui|tunes|tushu|tv|tvs|tw|tz|ua|ubank|ubs|ug|uk|unicom|university|uno|uol|ups|us|uy|uz|va|vacations|vana|vanguard|vc|ve|vegas|ventures|verisign|versicherung|vet|vg|vi|viajes|video|vig|viking|villas|vin|vip|virgin|visa|vision|viva|vivo|vlaanderen|vn|vodka|volkswagen|volvo|vote|voting|voto|voyage|vu|vuelos|wales|walmart|walter|wang|wanggou|watch|watches|weather|weatherchannel|webcam|weber|website|wed|wedding|weibo|weir|wf|whoswho|wien|wiki|williamhill|win|windows|wine|winners|wme|wolterskluwer|woodside|work|works|world|wow|ws|wtc|wtf|xbox|xerox|xfinity|xihuan|xin|xn--11b4c3d|xn--1ck2e1b|xn--1qqw23a|xn--2scrj9c|xn--30rr7y|xn--3bst00m|xn--3ds443g|xn--3e0b707e|xn--3hcrj9c|xn--3pxu8k|xn--42c2d9a|xn--45br5cyl|xn--45brj9c|xn--45q11c|xn--4dbrk0ce|xn--4gbrim|xn--54b7fta0cc|xn--55qw42g|xn--55qx5d|xn--5su34j936bgsg|xn--5tzm5g|xn--6frz82g|xn--6qq986b3xl|xn--80adxhks|xn--80ao21a|xn--80aqecdr1a|xn--80asehdb|xn--80aswg|xn--8y0a063a|xn--90a3ac|xn--90ae|xn--90ais|xn--9dbq2a|xn--9et52u|xn--9krt00a|xn--b4w605ferd|xn--bck1b9a5dre4c|xn--c1avg|xn--c2br7g|xn--cck2b3b|xn--cckwcxetd|xn--cg4bki|xn--clchc0ea0b2g2a9gcd|xn--czr694b|xn--czrs0t|xn--czru2d|xn--d1acj3b|xn--d1alf|xn--e1a4c|xn--eckvdtc9d|xn--efvy88h|xn--fct429k|xn--fhbei|xn--fiq228c5hs|xn--fiq64b|xn--fiqs8s|xn--fiqz9s|xn--fjq720a|xn--flw351e|xn--fpcrj9c3d|xn--fzc2c9e2c|xn--fzys8d69uvgm|xn--g2xx48c|xn--gckr3f0f|xn--gecrj9c|xn--gk3at1e|xn--h2breg3eve|xn--h2brj9c|xn--h2brj9c8c|xn--hxt814e|xn--i1b6b1a6a2e|xn--imr513n|xn--io0a7i|xn--j1aef|xn--j1amh|xn--j6w193g|xn--jlq480n2rg|xn--jlq61u9w7b|xn--jvr189m|xn--kcrx77d1x4a|xn--kprw13d|xn--kpry57d|xn--kput3i|xn--l1acc|xn--lgbbat1ad8j|xn--mgb9awbf|xn--mgba3a3ejt|xn--mgba3a4f16a|xn--mgba7c0bbn0a|xn--mgbaakc7dvf|xn--mgbaam7a8h|xn--mgbab2bd|xn--mgbah1a3hjkrd|xn--mgbai9azgqp6j|xn--mgbayh7gpa|xn--mgbbh1a|xn--mgbbh1a71e|xn--mgbc0a9azcg|xn--mgbca7dzdo|xn--mgbcpq6gpa1a|xn--mgberp4a5d4ar|xn--mgbgu82a|xn--mgbi4ecexp|xn--mgbpl2fh|xn--mgbt3dhd|xn--mgbtx2b|xn--mgbx4cd0ab|xn--mix891f|xn--mk1bu44c|xn--mxtq1m|xn--ngbc5azd|xn--ngbe9e0a|xn--ngbrx|xn--node|xn--nqv7f|xn--nqv7fs00ema|xn--nyqy26a|xn--o3cw4h|xn--ogbpf8fl|xn--otu796d|xn--p1acf|xn--p1ai|xn--pgbs0dh|xn--pssy2u|xn--q7ce6a|xn--q9jyb4c|xn--qcka1pmc|xn--qxa6a|xn--qxam|xn--rhqv96g|xn--rovu88b|xn--rvc1e0am3e|xn--s9brj9c|xn--ses554g|xn--t60b56a|xn--tckwe|xn--tiq49xqyj|xn--unup4y|xn--vermgensberater-ctb|xn--vermgensberatung-pwb|xn--vhquv|xn--vuq861b|xn--w4r85el8fhu5dnra|xn--w4rs40l|xn--wgbh1c|xn--wgbl6a|xn--xhq521b|xn--xkc2al3hye2a|xn--xkc2dl3a5ee0h|xn--y9a3aq|xn--yfro4i67o|xn--ygbi2ammx|xn--zfr164b|xxx|xyz|yachts|yahoo|yamaxun|yandex|ye|yodobashi|yoga|yokohama|you|youtube|yt|yun|za|zappos|zara|zero|zip|zm|zone|zuerich|zw)\b))` + -- 2.45.2