~dricottone/parcels

f0c1b1cf25d9761fd02882bfd27a83a4f389b8ca — Dominic Ricottone 2 years ago df2d621 v1.1.0
Release 1.1

Retroactively tagging the previous commit as 1.0. 1.1 is the first tag
which will actually be importable as a module, but the project was fully
operational prior to this point, and I may need to reference the old
version in case any regressions surface.
8 files changed, 141 insertions(+), 79 deletions(-)

M Makefile
A common/parcels.go
R regexp.go => common/regexp.go
M main.go
R build/regexp.go-part1 => scripts/regexp.go-part1
R build/regexp.go-part2 => scripts/regexp.go-part2
R build/regexp.go-part3 => scripts/regexp.go-part3
R build/top-level-domains.txt => scripts/top-level-domains.txt
M Makefile => Makefile +11 -11
@@ 2,24 2,24 @@ INSTALL_DIR?=/usr/local/bin
URL_ICANN=https://data.iana.org/TLD/tlds-alpha-by-domain.txt
GO_FILES=$(shell find -name '*.go')

build/top-level-domains.txt:
	curl -o build/top-level-domains.txt $(URL_ICANN)
scripts/top-level-domains.txt:
	curl -o scripts/top-level-domains.txt $(URL_ICANN)

regexp.go: build/top-level-domains.txt
	tail +2 build/top-level-domains.txt \
common/regexp.go: scripts/top-level-domains.txt
	tail +2 scripts/top-level-domains.txt \
		| perl -pe 'chomp if eof' \
		| tr '[:upper:]' '[:lower:]' \
		| tr '\n' '|' > build/regexp.go-part2
	cat <(perl -pe 'chomp if eof' build/regexp.go-part1) \
		<(perl -pe 'chomp if eof' build/regexp.go-part2) \
		build/regexp.go-part3 \
		> regexp.go
		| tr '\n' '|' > scripts/regexp.go-part2
	cat <(perl -pe 'chomp if eof' scripts/regexp.go-part1) \
		<(perl -pe 'chomp if eof' scripts/regexp.go-part2) \
		scripts/regexp.go-part3 \
		> common/regexp.go

.PHONY: clean
clean:
	rm -f parcels regexp.go go.sum
	rm -f parcels common/regexp.go go.sum

parcels: $(GO_FILES) regexp.go
parcels: common/regexp.go $(GO_FILES)
	go build

.PHONY: build

A common/parcels.go => common/parcels.go +117 -0
@@ 0,0 1,117 @@
package common

import (
	"fmt"
	"io"
	"bufio"
	"strings"
	"regexp"
)

var urlPattern = regexp.MustCompile(UrlPattern)

// replace returns a copy of str in which the bytes from index beginning up to
// (but not including) index end are swapped for a bracketed reference marker
// of the form "[N]", where N is replacement.
func replace(str string, beginning int, end int, replacement int) string {
	head := str[:beginning]
	tail := str[end:]
	return fmt.Sprintf("%s[%d]%s", head, replacement, tail)
}

// pullFromScanner returns the URL at zero-based position target among all
// matches of urlPattern in the scanner's input, counting matches line by line
// in reading order.
//
// An empty string with a nil error means the URL is not in this input: either
// target is negative (the URL belongs to earlier input) or the input holds
// too few URLs. Scanning stops at the line containing the target, so any
// remaining input is left unread. The returned error is whatever scanner.Err
// reports once scanning has stopped.
func pullFromScanner(scanner *bufio.Scanner, target int) (string, error) {
	// A negative target would pass the "is it on this line" test below and
	// then index matches with a negative number, which panics. Treat it as
	// "not found" instead, exactly like a target past the end of the input.
	if target < 0 {
		return "", nil
	}

	found := ""
	skipped := 0 // URLs seen on the lines already consumed

	for scanner.Scan() {
		line := scanner.Text()
		matches := urlPattern.FindAllStringIndex(line, -1)

		// The target lives on this line when its line-local index falls
		// inside the matches found here.
		if local := target - skipped; local < len(matches) {
			found = line[matches[local][0]:matches[local][1]]
			break
		}

		// Otherwise account for this line's URLs and keep reading.
		skipped += len(matches)
	}

	return found, scanner.Err()
}

// parseFromScanner reads the scanner line by line, replaces every match of
// urlPattern with a numbered marker "[N]", and collects the replaced URLs.
//
// It returns two strings: the rewritten content (each line terminated with
// "\n") and the URL list, one "[N] url\n" entry per URL. Numbering starts at
// offset, which is the count of URLs that preceded this scanner's input, and
// continues across lines so every URL gets a distinct index. The returned
// error is whatever scanner.Err reports after the loop; the strings built up
// to that point are returned alongside it.
func parseFromScanner(scanner *bufio.Scanner, offset int) (string, string, error) {
	var content strings.Builder
	var urls strings.Builder
	cursor := offset // index assigned to the first URL of the current line

	for scanner.Scan() {
		line := scanner.Text()
		matches := urlPattern.FindAllStringIndex(line, -1)
		count := len(matches)
		found := make([]string, count)

		// Walk the matches right to left: replacing a later match first
		// leaves the byte offsets of the earlier ones valid.
		for i := count - 1; i >= 0; i-- {
			beg, end := matches[i][0], matches[i][1]
			found[i] = line[beg:end]
			line = replace(line, beg, end, cursor+i)
		}

		content.WriteString(line)
		content.WriteString("\n")

		for i, url := range found {
			fmt.Fprintf(&urls, "[%d] %s\n", cursor+i, url)
		}

		// Advance past this line's URLs; without this every line would
		// restart its numbering at offset and produce duplicate indices.
		cursor += count
	}

	return content.String(), urls.String(), scanner.Err()
}

// PullFromReader looks up a single URL in the text supplied by reader.
//
// target is the URL's index in an overall numbering and offset is the number
// of URLs that came before this reader, so the lookup inside the reader uses
// the index target-offset. The reader is wrapped in a bufio.Scanner and the
// search itself is delegated to pullFromScanner.
func PullFromReader(reader io.Reader, target int, offset int) (string, error) {
	scanner := bufio.NewScanner(reader)
	local := target - offset
	return pullFromScanner(scanner, local)
}

// PullFromString looks up a single URL in str.
//
// target is the URL's index in an overall numbering and offset is the number
// of URLs that came before this string. The string is exposed as an io.Reader
// and the work is delegated to PullFromReader.
func PullFromString(str string, target int, offset int) (string, error) {
	source := strings.NewReader(str)
	return PullFromReader(source, target, offset)
}

// ParseFromReader extracts the URLs from the text supplied by reader.
//
// It returns two strings: the reader's content with the URLs rewritten, and
// the list of URLs. offset is the number of URLs that came before this
// reader. The reader is wrapped in a bufio.Scanner and the work is delegated
// to parseFromScanner.
func ParseFromReader(reader io.Reader, offset int) (string, string, error) {
	scanner := bufio.NewScanner(reader)
	return parseFromScanner(scanner, offset)
}

// ParseFromString extracts the URLs from str.
//
// It returns two strings: the content of str with the URLs rewritten, and the
// list of URLs. offset is the number of URLs that came before str. The string
// is exposed as an io.Reader and the work is delegated to ParseFromReader.
func ParseFromString(str string, offset int) (string, string, error) {
	source := strings.NewReader(str)
	return ParseFromReader(source, offset)
}


R regexp.go => common/regexp.go +1 -1
@@ 1,4 1,4 @@
package main
package common

var UrlPattern = `:?((?:(?:https?|file|ftps?)://[{}()@\w/\\\-%?!&.=:;+,#~]*[{}(@\w/\-%&=+#$])|(?:[\w\-%]+(?:\.[\w\-%]+)*\.(?:aaa|aarp|abarth|abb|abbott|abbvie|abc|able|abogado|abudhabi|ac|academy|accenture|accountant|accountants|aco|actor|ad|adac|ads|adult|ae|aeg|aero|aetna|af|afl|africa|ag|agakhan|agency|ai|aig|airbus|airforce|airtel|akdn|al|alfaromeo|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|am|amazon|americanexpress|americanfamily|amex|amfam|amica|amsterdam|analytics|android|anquan|anz|ao|aol|apartments|app|apple|aq|aquarelle|ar|arab|aramco|archi|army|arpa|art|arte|as|asda|asia|associates|at|athleta|attorney|au|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aw|aws|ax|axa|az|azure|ba|baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays|barefoot|bargains|baseball|basketball|bauhaus|bayern|bb|bbc|bbt|bbva|bcg|bcn|bd|be|beats|beauty|beer|bentley|berlin|best|bestbuy|bet|bf|bg|bh|bharti|bi|bible|bid|bike|bing|bingo|bio|biz|bj|black|blackfriday|blockbuster|blog|bloomberg|blue|bm|bms|bmw|bn|bnpparibas|bo|boats|boehringer|bofa|bom|bond|boo|book|booking|bosch|bostik|boston|bot|boutique|box|br|bradesco|bridgestone|broadway|broker|brother|brussels|bs|bt|budapest|bugatti|build|builders|business|buy|buzz|bv|bw|by|bz|bzh|ca|cab|cafe|cal|call|calvinklein|cam|camera|camp|cancerresearch|canon|capetown|capital|capitalone|car|caravan|cards|care|career|careers|cars|casa|case|cash|casino|cat|catering|catholic|cba|cbn|cbre|cbs|cc|cd|center|ceo|cern|cf|cfa|cfd|cg|ch|chanel|channel|charity|chase|chat|cheap|chintai|christmas|chrome|church|ci|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|ck|cl|claims|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|cm|cn|co|coach|codes|coffee|college|cologne|com|comcast|commbank|community|company|compare|computer|comsec|condos|construction|consulting|contact|contractors|cooking|cookingchannel|cool|coop|corsica|country|coupon|coupons|courses|cpa|cr|credit|creditcard|creditunion|cricket|cr
own|crs|cruise|cruises|csc|cu|cuisinella|cv|cw|cx|cy|cymru|cyou|cz|dabur|dad|dance|data|date|dating|datsun|day|dclk|dds|de|deal|dealer|deals|degree|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet|digital|direct|directory|discount|discover|dish|diy|dj|dk|dm|dnp|do|docs|doctor|dog|domains|dot|download|drive|dtv|dubai|dunlop|dupont|durban|dvag|dvr|dz|earth|eat|ec|eco|edeka|edu|education|ee|eg|email|emerck|energy|engineer|engineering|enterprises|epson|equipment|er|ericsson|erni|es|esq|estate|et|etisalat|eu|eurovision|eus|events|exchange|expert|exposed|express|extraspace|fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback|ferrari|ferrero|fi|fiat|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale|fish|fishing|fit|fitness|fj|fk|flickr|flights|flir|florist|flowers|fly|fm|fo|foo|food|foodnetwork|football|ford|forex|forsale|forum|foundation|fox|fr|free|fresenius|frl|frogans|frontdoor|frontier|ftr|fujitsu|fun|fund|furniture|futbol|fyi|ga|gal|gallery|gallo|gallup|game|games|gap|garden|gay|gb|gbiz|gd|gdn|ge|gea|gent|genting|george|gf|gg|ggee|gh|gi|gift|gifts|gives|giving|gl|glass|gle|global|globo|gm|gmail|gmbh|gmo|gmx|gn|godaddy|gold|goldpoint|golf|goo|goodyear|goog|google|gop|got|gov|gp|gq|gr|grainger|graphics|gratis|green|gripe|grocery|group|gs|gt|gu|guardian|gucci|guge|guide|guitars|guru|gw|gy|hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|healthcare|help|helsinki|here|hermes|hgtv|hiphop|hisamitsu|hitachi|hiv|hk|hkt|hm|hn|hockey|holdings|holiday|homedepot|homegoods|homes|homesense|honda|horse|hospital|host|hosting|hot|hoteles|hotels|hotmail|house|how|hr|hsbc|ht|hu|hughes|hyatt|hyundai|ibm|icbc|ice|icu|id|ie|ieee|ifm|ikano|il|im|imamat|imdb|immo|immobilien|in|inc|industries|infiniti|info|ing|ink|institute|insurance|insure|int|international|intuit|investments|io|ipiranga|iq|ir|irish|is|ismaili|ist|istanbul|it|itau|itv|jaguar|java|jcb|je|jeep|jetzt|jewelry|jio|jll|jm|jmp|jnj|jo|jobs|joburg|
jot|joy|jp|jpmorgan|jprs|juegos|juniper|kaufen|kddi|ke|kerryhotels|kerrylogistics|kerryproperties|kfh|kg|kh|ki|kia|kim|kinder|kindle|kitchen|kiwi|km|kn|koeln|komatsu|kosher|kp|kpmg|kpn|kr|krd|kred|kuokgroup|kw|ky|kyoto|kz|la|lacaixa|lamborghini|lamer|lancaster|lancia|land|landrover|lanxess|lasalle|lat|latino|latrobe|law|lawyer|lb|lc|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|li|lidl|life|lifeinsurance|lifestyle|lighting|like|lilly|limited|limo|lincoln|linde|link|lipsy|live|living|lixil|lk|llc|llp|loan|loans|locker|locus|loft|lol|london|lotte|lotto|love|lpl|lplfinancial|lr|ls|lt|ltd|ltda|lu|lundbeck|luxe|luxury|lv|ly|ma|macys|madrid|maif|maison|makeup|man|management|mango|map|market|marketing|markets|marriott|marshalls|maserati|mattel|mba|mc|mckinsey|md|me|med|media|meet|melbourne|meme|memorial|men|menu|merckmsd|mg|mh|miami|microsoft|mil|mini|mint|mit|mitsubishi|mk|ml|mlb|mls|mm|mma|mn|mo|mobi|mobile|moda|moe|moi|mom|monash|money|monster|mormon|mortgage|moscow|moto|motorcycles|mov|movie|mp|mq|mr|ms|msd|mt|mtn|mtr|mu|museum|music|mutual|mv|mw|mx|my|mz|na|nab|nagoya|name|natura|navy|nba|nc|ne|nec|net|netbank|netflix|network|neustar|new|news|next|nextdirect|nexus|nf|nfl|ng|ngo|nhk|ni|nico|nike|nikon|ninja|nissan|nissay|nl|no|nokia|northwesternmutual|norton|now|nowruz|nowtv|np|nr|nra|nrw|ntt|nu|nyc|nz|obi|observer|office|okinawa|olayan|olayangroup|oldnavy|ollo|om|omega|one|ong|onl|online|ooo|open|oracle|orange|org|organic|origins|osaka|otsuka|ott|ovh|pa|page|panasonic|paris|pars|partners|parts|party|passagens|pay|pccw|pe|pet|pf|pfizer|pg|ph|pharmacy|phd|philips|phone|photo|photography|photos|physio|pics|pictet|pictures|pid|pin|ping|pink|pioneer|pizza|pk|pl|place|play|playstation|plumbing|plus|pm|pn|pnc|pohl|poker|politie|porn|post|pr|pramerica|praxi|press|prime|pro|prod|productions|prof|progressive|promo|properties|property|protection|pru|prudential|ps|pt|pub|pw|pwc|py|qa|qpon|quebec|quest|racing|radio|re|read|realestate|realtor|realty|recipes|red|redstone|redumbrell
a|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant|review|reviews|rexroth|rich|richardli|ricoh|ril|rio|rip|ro|rocher|rocks|rodeo|rogers|room|rs|rsvp|ru|rugby|ruhr|run|rw|rwe|ryukyu|sa|saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant|sanofi|sap|sarl|sas|save|saxo|sb|sbi|sbs|sc|sca|scb|schaeffler|schmidt|scholarships|school|schule|schwarz|science|scot|sd|se|search|seat|secure|security|seek|select|sener|services|ses|seven|sew|sex|sexy|sfr|sg|sh|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shop|shopping|shouji|show|showtime|si|silk|sina|singles|site|sj|sk|ski|skin|sky|skype|sl|sling|sm|smart|smile|sn|sncf|so|soccer|social|softbank|software|sohu|solar|solutions|song|sony|soy|spa|space|sport|spot|sr|srl|ss|st|stada|staples|star|statebank|statefarm|stc|stcgroup|stockholm|storage|store|stream|studio|study|style|su|sucks|supplies|supply|support|surf|surgery|suzuki|sv|swatch|swiss|sx|sy|sydney|systems|sz|tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tc|tci|td|tdk|team|tech|technology|tel|temasek|tennis|teva|tf|tg|th|thd|theater|theatre|tiaa|tickets|tienda|tiffany|tips|tires|tirol|tj|tjmaxx|tjx|tk|tkmaxx|tl|tm|tmall|tn|to|today|tokyo|tools|top|toray|toshiba|total|tours|town|toyota|toys|tr|trade|trading|training|travel|travelchannel|travelers|travelersinsurance|trust|trv|tt|tube|tui|tunes|tushu|tv|tvs|tw|tz|ua|ubank|ubs|ug|uk|unicom|university|uno|uol|ups|us|uy|uz|va|vacations|vana|vanguard|vc|ve|vegas|ventures|verisign|versicherung|vet|vg|vi|viajes|video|vig|viking|villas|vin|vip|virgin|visa|vision|viva|vivo|vlaanderen|vn|vodka|volkswagen|volvo|vote|voting|voto|voyage|vu|vuelos|wales|walmart|walter|wang|wanggou|watch|watches|weather|weatherchannel|webcam|weber|website|wed|wedding|weibo|weir|wf|whoswho|wien|wiki|williamhill|win|windows|wine|winners|wme|wolterskluwer|woodside|work|works|world|wow|ws|wtc|wtf|xbox|xerox|xfinity|xihuan|xin|xn--11b4c3d|xn--1ck2e1b|xn--1qqw23a|xn--2scrj9c|xn--3
0rr7y|xn--3bst00m|xn--3ds443g|xn--3e0b707e|xn--3hcrj9c|xn--3pxu8k|xn--42c2d9a|xn--45br5cyl|xn--45brj9c|xn--45q11c|xn--4dbrk0ce|xn--4gbrim|xn--54b7fta0cc|xn--55qw42g|xn--55qx5d|xn--5su34j936bgsg|xn--5tzm5g|xn--6frz82g|xn--6qq986b3xl|xn--80adxhks|xn--80ao21a|xn--80aqecdr1a|xn--80asehdb|xn--80aswg|xn--8y0a063a|xn--90a3ac|xn--90ae|xn--90ais|xn--9dbq2a|xn--9et52u|xn--9krt00a|xn--b4w605ferd|xn--bck1b9a5dre4c|xn--c1avg|xn--c2br7g|xn--cck2b3b|xn--cckwcxetd|xn--cg4bki|xn--clchc0ea0b2g2a9gcd|xn--czr694b|xn--czrs0t|xn--czru2d|xn--d1acj3b|xn--d1alf|xn--e1a4c|xn--eckvdtc9d|xn--efvy88h|xn--fct429k|xn--fhbei|xn--fiq228c5hs|xn--fiq64b|xn--fiqs8s|xn--fiqz9s|xn--fjq720a|xn--flw351e|xn--fpcrj9c3d|xn--fzc2c9e2c|xn--fzys8d69uvgm|xn--g2xx48c|xn--gckr3f0f|xn--gecrj9c|xn--gk3at1e|xn--h2breg3eve|xn--h2brj9c|xn--h2brj9c8c|xn--hxt814e|xn--i1b6b1a6a2e|xn--imr513n|xn--io0a7i|xn--j1aef|xn--j1amh|xn--j6w193g|xn--jlq480n2rg|xn--jlq61u9w7b|xn--jvr189m|xn--kcrx77d1x4a|xn--kprw13d|xn--kpry57d|xn--kput3i|xn--l1acc|xn--lgbbat1ad8j|xn--mgb9awbf|xn--mgba3a3ejt|xn--mgba3a4f16a|xn--mgba7c0bbn0a|xn--mgbaakc7dvf|xn--mgbaam7a8h|xn--mgbab2bd|xn--mgbah1a3hjkrd|xn--mgbai9azgqp6j|xn--mgbayh7gpa|xn--mgbbh1a|xn--mgbbh1a71e|xn--mgbc0a9azcg|xn--mgbca7dzdo|xn--mgbcpq6gpa1a|xn--mgberp4a5d4ar|xn--mgbgu82a|xn--mgbi4ecexp|xn--mgbpl2fh|xn--mgbt3dhd|xn--mgbtx2b|xn--mgbx4cd0ab|xn--mix891f|xn--mk1bu44c|xn--mxtq1m|xn--ngbc5azd|xn--ngbe9e0a|xn--ngbrx|xn--node|xn--nqv7f|xn--nqv7fs00ema|xn--nyqy26a|xn--o3cw4h|xn--ogbpf8fl|xn--otu796d|xn--p1acf|xn--p1ai|xn--pgbs0dh|xn--pssy2u|xn--q7ce6a|xn--q9jyb4c|xn--qcka1pmc|xn--qxa6a|xn--qxam|xn--rhqv96g|xn--rovu88b|xn--rvc1e0am3e|xn--s9brj9c|xn--ses554g|xn--t60b56a|xn--tckwe|xn--tiq49xqyj|xn--unup4y|xn--vermgensberater-ctb|xn--vermgensberatung-pwb|xn--vhquv|xn--vuq861b|xn--w4r85el8fhu5dnra|xn--w4rs40l|xn--wgbh1c|xn--wgbl6a|xn--xhq521b|xn--xkc2al3hye2a|xn--xkc2dl3a5ee0h|xn--y9a3aq|xn--yfro4i67o|xn--ygbi2ammx|xn--zfr164b|xxx|xyz|yachts|yahoo|yamaxun|yandex|ye|yodobashi|yoga|yokohama|you|youtube
|yt|yun|za|zappos|zara|zero|zip|zm|zone|zuerich|zw)\b))`


M main.go => main.go +11 -66
@@ 4,84 4,30 @@ import (
	"fmt"
	"os"
	"io"
	"bufio"
	"regexp"
	"flag"
)

func Replacement(i int) string {
	return fmt.Sprintf("[%d]", i)
}
	"git.dominic-ricottone.com/~dricottone/parcels/common"
)

func find_in_stream(reader io.Reader, target int) {
	// Create scanner from reader
	input := bufio.NewScanner(reader)

	// Initialize state
	re := regexp.MustCompile(UrlPattern)
	count := 0

	// Parse and print
	for input.Scan() {
		line := input.Text()
		line_indices := re.FindAllStringIndex(line, -1)
		count_after := count + len(line_indices)

		if target < count_after {
			beg, end := line_indices[target-count][0], line_indices[target-count][1]
			fmt.Println(line[beg:end])
			break
		}

		count = count_after
	}

	// Check for scanner errors
	if err := input.Err(); err != nil {
	url, err := common.PullFromReader(reader, target, 0)
	if err != nil {
		fmt.Printf("internal error - %v\n", err)
		os.Exit(1)
	}

	fmt.Println(url)
}

func parse_stream(reader io.Reader) {
	// Create scanner from reader
	input := bufio.NewScanner(reader)

	// Initialize state
	re := regexp.MustCompile(UrlPattern)
	var parcels [999]string //assuming that never need 1000+ URLs
	offset := 0

	// Parse, modify, and print
	for input.Scan() {
		line := input.Text()

		line_indices := re.FindAllStringIndex(line, -1)
		for i := len(line_indices)-1; i >= 0; i-- {
			beg, end := line_indices[i][0], line_indices[i][1]
			parcels[offset+i] = line[beg:end]
			line = line[:beg] + Replacement(offset+i) + line[end:]
		}

		fmt.Println(line)

		offset = offset + len(line_indices)
	}

	// Print postscript
	fmt.Printf("\nURLs:\n")
	for index, url := range parcels {
		if url == "" {
			break
		}
		fmt.Printf("[%d] %s\n", index, url)
	}

	// Check for scanner errors
	if err := input.Err(); err != nil {
	content, urls, err := common.ParseFromReader(reader, 0)
	if err != nil {
		fmt.Printf("internal error - %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("%s", content)
	fmt.Printf("%s", urls)
}

func parse_file(filename string) {


@@ 93,7 39,6 @@ func parse_file(filename string) {
	}
	defer file.Close()

	// Parse
	parse_stream(file)
}


R build/regexp.go-part1 => scripts/regexp.go-part1 +1 -1
@@ 1,3 1,3 @@
package main
package common

var UrlPattern = `:?((?:(?:https?|file|ftps?)://[{}()@\w/\\\-%?!&.=:;+,#~]*[{}(@\w/\-%&=+#$])|(?:[\w\-%]+(?:\.[\w\-%]+)*\.(?:

R build/regexp.go-part2 => scripts/regexp.go-part2 +0 -0
R build/regexp.go-part3 => scripts/regexp.go-part3 +0 -0
R build/top-level-domains.txt => scripts/top-level-domains.txt +0 -0