Add options to independently allow or disallow matching of scheme, www, and top level domain urls

gregjacobs · gregjacobs · commit ba438ba3e997 · 2015-11-10T23:21:24.000-05:00
diff --git a/README.md b/README.md
@@ -139,9 +139,29 @@ providing an Object as the second parameter to [Autolinker.link()](http://gregja
   4) Twitter links will have the CSS classes: "myLink myLink-twitter"<br />
   5) Hashtag links will have the CSS classes: "myLink myLink-hashtag"<br />
 
-- [urls](http://gregjacobs.github.io/Autolinker.js/docs/#!/api/Autolinker-cfg-urls) : Boolean<br />
+- [urls](http://gregjacobs.github.io/Autolinker.js/docs/#!/api/Autolinker-cfg-urls) : Boolean/Object<br />
   `true` to have URLs auto-linked, `false` to skip auto-linking of URLs.
-  Defaults to `true`.<br />
+  Defaults to `true`.<br>
+  
+  This option also accepts an Object form with 3 properties, to allow for more
+  customization of what exactly gets linked. All default to `true`:
+   
+    - schemeMatches (Boolean): `true` to match URLs found prefixed with a scheme,
+      i.e. `http://google.com`, or `other+scheme://google.com`, `false` to
+      prevent these types of matches.
+    - wwwMatches (Boolean): `true` to match urls found prefixed with `'www.'`,
+      i.e. `www.google.com`. `false` to prevent these types of matches. Note 
+      that if the URL had a prefixed scheme, and `schemeMatches` is true, it 
+      will still be linked.
+    - tldMatches (Boolean): `true` to match URLs with known top level domains
+      (.com, .net, etc.) that are not prefixed with a scheme or `'www.'`. This
+      option attempts to match anything that looks like a URL in the given text.
+      Ex: `google.com`, `asdf.org/?page=1`, etc. `false` to prevent these types
+      of matches.
+      <br />
+      
+  Example usage: `urls: { schemeMatches: true, wwwMatches: true, tldMatches: false }`
+    
 - [email](http://gregjacobs.github.io/Autolinker.js/docs/#!/api/Autolinker-cfg-email) : Boolean<br />
   `true` to have email addresses auto-linked, `false` to skip auto-linking of
   email addresses. Defaults to `true`.<br /><br />
diff --git a/src/Autolinker.js b/src/Autolinker.js
@@ -113,44 +113,59 @@ var Autolinker = function( cfg ) {
 		throw new Error( "invalid `hashtag` cfg - see docs" );
 	}
 
-	// Normalize the `truncate` option
-	var truncate = this.truncate = this.truncate || {};
-	if( typeof truncate === 'number' ) {
-		this.truncate = { length: truncate, location: 'end' };
-	} else if( typeof truncate === 'object' ) {
-		this.truncate.length = truncate.length || Number.POSITIVE_INFINITY;
-		this.truncate.location = truncate.location || 'end';
-	}
+	// Normalize the configs
+	this.urls     = this.normalizeUrlsCfg( this.urls );
+	this.truncate = this.normalizeTruncateCfg( this.truncate );
 };
 
 Autolinker.prototype = {
 	constructor : Autolinker,  // fix constructor property
 
 	/**
-	 * @cfg {Boolean} urls
-	 *
-	 * `true` if miscellaneous URLs should be automatically linked, `false` if they should not be.
+	 * @cfg {Boolean/Object} urls
+	 *
+	 * `true` if URLs should be automatically linked, `false` if they should not
+	 * be.
+	 *
+	 * This option also accepts an Object form with 3 properties, to allow for
+	 * more customization of what exactly gets linked. All default to `true`:
+	 *
+	 * @param {Boolean} schemeMatches `true` to match URLs found prefixed with a
+	 *   scheme, i.e. `http://google.com`, or `other+scheme://google.com`,
+	 *   `false` to prevent these types of matches.
+	 * @param {Boolean} wwwMatches `true` to match urls found prefixed with
+	 *   `'www.'`, i.e. `www.google.com`. `false` to prevent these types of
+	 *   matches. Note that if the URL had a prefixed scheme, and
+	 *   `schemeMatches` is true, it will still be linked.
+	 * @param {Boolean} tldMatches `true` to match URLs with known top level
+	 *   domains (.com, .net, etc.) that are not prefixed with a scheme or
+	 *   `'www.'`. This option attempts to match anything that looks like a URL
+	 *   in the given text. Ex: `google.com`, `asdf.org/?page=1`, etc. `false`
+	 *   to prevent these types of matches.
 	 */
 	urls : true,
 
 	/**
 	 * @cfg {Boolean} email
 	 *
-	 * `true` if email addresses should be automatically linked, `false` if they should not be.
+	 * `true` if email addresses should be automatically linked, `false` if they
+	 * should not be.
 	 */
 	email : true,
 
 	/**
 	 * @cfg {Boolean} twitter
 	 *
-	 * `true` if Twitter handles ("@example") should be automatically linked, `false` if they should not be.
+	 * `true` if Twitter handles ("@example") should be automatically linked,
+	 * `false` if they should not be.
 	 */
 	twitter : true,
 
 	/**
 	 * @cfg {Boolean} phone
 	 *
-	 * `true` if Phone numbers ("(555)555-5555") should be automatically linked, `false` if they should not be.
+	 * `true` if Phone numbers ("(555)555-5555") should be automatically linked,
+	 * `false` if they should not be.
 	 */
 	phone: true,
 
@@ -288,6 +303,49 @@ Autolinker.prototype = {
 	 */
 	tagBuilder : undefined,
 
+
+	/**
+	 * Normalizes the {@link #urls} config into an Object with 3 properties:
+	 * `schemeMatches`, `wwwMatches`, and `tldMatches`, all Booleans.
+	 *
+	 * See {@link #urls} config for details.
+	 *
+	 * @private
+	 * @param {Boolean/Object} urls Boolean for all 3 properties, or an Object.
+	 * @return {Object} A new Object, or the given Object with defaults filled in.
+	 */
+	normalizeUrlsCfg : function( urls ) {
+		if( typeof urls === 'boolean' ) {  // one flag applies to all 3 match types
+			return { schemeMatches: urls, wwwMatches: urls, tldMatches: urls };
+		} else {  // object, or undefined/null: `undefined` properties become `true`
+			return Autolinker.Util.defaults( urls || {}, { schemeMatches: true, wwwMatches: true, tldMatches: true } );
+		}
+	},
+
+
+	/**
+	 * Normalizes the {@link #truncate} config into an Object with 2 properties:
+	 * `length` (Number), and `location` (String).
+	 *
+	 * See {@link #truncate} config for details.
+	 *
+	 * @private
+	 * @param {Number/Object} truncate A Number used as `length`, or an Object.
+	 * @return {Object} A new Object, or the given Object with defaults filled in.
+	 */
+	normalizeTruncateCfg : function( truncate ) {
+		if( typeof truncate === 'number' ) {  // bare Number: `location` is 'end'
+			return { length: truncate, location: 'end' };
+
+		} else {  // object, or undefined/null
+			return Autolinker.Util.defaults( truncate || {}, {
+				length   : Number.POSITIVE_INFINITY,
+				location : 'end'
+			} );
+		}
+	},
+
+
 	/**
 	 * Automatically links URLs, Email addresses, Phone numbers, Twitter
 	 * handles, and Hashtags found in the given chunk of HTML. Does not link
diff --git a/src/Util.js b/src/Util.js
@@ -44,6 +44,25 @@ Autolinker.Util = {
 	},
 
 
+	/**
+	 * Assigns (shallow copies) the own properties of `src` onto `dest`, but
+	 * only where the property on `dest` === `undefined`. Mutates `dest`.
+	 *
+	 * @param {Object} dest The destination object.
+	 * @param {Object} src The source object.
+	 * @return {Object} The destination object (`dest`)
+	 */
+	defaults : function( dest, src ) {
+		for( var prop in src ) {
+			if( src.hasOwnProperty( prop ) && dest[ prop ] === undefined ) {  // own props of `src` only; keep defined `dest` values
+				dest[ prop ] = src[ prop ];
+			}
+		}
+
+		return dest;
+	},
+
+
 	/**
 	 * Extends `superclass` to create a new subclass, adding the `protoProps` to the new subclass's prototype.
 	 *
diff --git a/src/matchParser/MatchParser.js b/src/matchParser/MatchParser.js
@@ -14,7 +14,7 @@
 Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
 
 	/**
-	 * @cfg {Boolean} urls
+	 * @cfg {Object} urls
 	 * @inheritdoc Autolinker#urls
 	 */
 	urls : true,
@@ -78,26 +78,31 @@ Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
 	 *     used to match protocol URLs with just a single word, like 'http://localhost',
 	 *     where we won't double check that the domain name has at least one '.'
 	 *     in it.
-	 * 7.  A protocol-relative ('//') match for the case of a 'www.' prefixed
+	 * 7.  Group that matches a 'www.' prefixed URL. This is only matched if the
+	 *     'www.' text was not prefixed by a scheme (i.e.: not prefixed by
+	 *     'http://', 'ftp:', etc.)
+	 * 8.  A protocol-relative ('//') match for the case of a 'www.' prefixed
 	 *     URL. Will be an empty string if it is not a protocol-relative match.
 	 *     We need to know the character before the '//' in order to determine
 	 *     if it is a valid match or the // was in a string we don't want to
 	 *     auto-link.
-	 * 8.  A protocol-relative ('//') match for the case of a known TLD prefixed
+	 * 9.  Group that matches a known TLD (top level domain), when a scheme
+	 *     or 'www.'-prefixed domain is not matched.
+	 * 10.  A protocol-relative ('//') match for the case of a known TLD prefixed
 	 *     URL. Will be an empty string if it is not a protocol-relative match.
 	 *     See #6 for more info.
-	 * 9.  Group that is used to determine if there is a phone number match.
-	 * 10. If there is a phone number match, and a '+' sign was included with
+	 * 11. Group that is used to determine if there is a phone number match.
+	 * 12. If there is a phone number match, and a '+' sign was included with
 	 *     the phone number, this group will be populated with the '+' sign.
-	 * 11. Group that is used to determine if there is a Hashtag match
+	 * 13. Group that is used to determine if there is a Hashtag match
 	 *     (i.e. \#someHashtag). Simply check for its existence to determine if
 	 *     there is a Hashtag match. The next couple of capturing groups give
 	 *     information about the Hashtag match.
-	 * 12. The whitespace character before the #sign in a Hashtag handle. This
+	 * 14. The whitespace character before the #sign in a Hashtag handle. This
 	 *     is needed because there are no look-behinds in JS regular
 	 *     expressions, and can be used to reconstruct the original string in a
 	 *     replace().
-	 * 13. The Hashtag itself in a Hashtag match. If the match is
+	 * 15. The Hashtag itself in a Hashtag match. If the match is
 	 *     '#someHashtag', the hashtag is 'someHashtag'.
 	 */
 	matcherRegex : (function() {
@@ -135,23 +140,23 @@ Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
 
 			'(',  // *** Capturing group $5, which is used to match a URL
 				'(?:', // parens to cover match for protocol (optional), and domain
-					'(',  // *** Capturing group $6, for a protocol-prefixed url (ex: http://google.com)
+					'(',  // *** Capturing group $6, for a scheme-prefixed url (ex: http://google.com)
 						protocolRegex.source,
 						domainNameRegex.source,
 					')',
 
 					'|',
 
-					'(?:',  // non-capturing paren for a 'www.' prefixed url (ex: www.google.com)
-						'(.?//)?',  // *** Capturing group $7 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character
+					'(',  // *** Capturing group $7, for a 'www.' prefixed url (ex: www.google.com)
+						'(.?//)?',  // *** Capturing group $8 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character
 						wwwRegex.source,
 						domainNameRegex.source,
 					')',
 
 					'|',
 
-					'(?:',  // non-capturing paren for known a TLD url (ex: google.com)
-						'(.?//)?',  // *** Capturing group $8 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character
+					'(',  // *** Capturing group $9, for a known TLD url (ex: google.com)
+						'(.?//)?',  // *** Capturing group $10 for an optional protocol-relative URL. Must be at the beginning of the string or start with a non-word character
 						domainNameRegex.source,
 						tldRegex.source,
 					')',
@@ -163,17 +168,17 @@ Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
 			'|',
 
 			// this setup does not scale well for open extension :( Need to rethink design of autolinker...
-			// *** Capturing group $9, which matches a (USA for now) phone number, and
-			// *** Capturing group $10, which matches the '+' sign for international numbers, if it exists
+			// *** Capturing group $11, which matches a (USA for now) phone number, and
+			// *** Capturing group $12, which matches the '+' sign for international numbers, if it exists
 			'(',
 				phoneRegex.source,
 			')',
 
 			'|',
 
-			'(',  // *** Capturing group $11, which can be used to check for a Hashtag match. Use group $12 for the actual Hashtag though. $11 may be used to reconstruct the original string in a replace()
-				// *** Capturing group $12, which matches the whitespace character before the '#' sign (needed because of no lookbehinds), and
-				// *** Capturing group $13, which matches the actual Hashtag
+			'(',  // *** Capturing group $13, which can be used to check for a Hashtag match. Use group $15 for the actual Hashtag though. $13 may be used to reconstruct the original string in a replace()
+				// *** Capturing group $14, which matches the whitespace character before the '#' sign (needed because of no lookbehinds), and
+				// *** Capturing group $15, which matches the actual Hashtag
 				hashtagRegex.source,
 			')'
 		].join( "" ), 'gi' );
@@ -230,8 +235,8 @@ Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
 	replace : function( text, replaceFn, contextObj ) {
 		var me = this;  // for closure
 
-		return text.replace( this.matcherRegex, function( matchStr, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 ) {
-			var matchDescObj = me.processCandidateMatch( matchStr, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 );  // "match description" object
+		return text.replace( this.matcherRegex, function( matchStr/*, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15*/ ) {
+			var matchDescObj = me.processCandidateMatch.apply( me, arguments );  // "match description" object
 
 			// Return out with no changes for match types that are disabled (url,
 			// email, phone, etc.), or for matches that are invalid (false
@@ -271,12 +276,17 @@ Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
 	 * @param {String} emailAddressMatch The matched email address for an email
 	 *   address match.
 	 * @param {String} urlMatch The matched URL string for a URL match.
-	 * @param {String} protocolUrlMatch The match URL string for a protocol
+	 * @param {String} schemeUrlMatch The matched URL string for a scheme
 	 *   match. Ex: 'http://yahoo.com'. This is used to match something like
 	 *   'http://localhost', where we won't double check that the domain name
 	 *   has at least one '.' in it.
+	 * @param {String} wwwMatch The matched string of a 'www.'-prefixed URL that
+	 *   was matched. This is only matched if the 'www.' text was not prefixed
+	 *   by a scheme (i.e.: not prefixed by 'http://', 'ftp:', etc.).
 	 * @param {String} wwwProtocolRelativeMatch The '//' for a protocol-relative
 	 *   match from a 'www' url, with the character that comes before the '//'.
+	 * @param {String} tldMatch The matched string of a known TLD (top level
+	 *   domain), when a scheme or 'www.'-prefixed domain is not matched.
 	 * @param {String} tldProtocolRelativeMatch The '//' for a protocol-relative
 	 *   match from a TLD (top level domain) match, with the character that
 	 *   comes before the '//'.
@@ -308,8 +318,8 @@ Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
 	 */
 	processCandidateMatch : function(
 		matchStr, twitterMatch, twitterHandlePrefixWhitespaceChar, twitterHandle,
-		emailAddressMatch, urlMatch, protocolUrlMatch, wwwProtocolRelativeMatch,
-		tldProtocolRelativeMatch, phoneMatch, phonePlusSignMatch, hashtagMatch,
+		emailAddressMatch, urlMatch, schemeUrlMatch, wwwMatch, wwwProtocolRelativeMatch,
+		tldMatch, tldProtocolRelativeMatch, phoneMatch, phonePlusSignMatch, hashtagMatch,
 		hashtagPrefixWhitespaceChar, hashtag
 	) {
 		// Note: The `matchStr` variable wil be fixed up to remove characters that are no longer needed (which will
@@ -319,19 +329,23 @@ Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
 		    match,  // Will be an Autolinker.match.Match object
 
 		    prefixStr = "",  // A string to use to prefix the anchor tag that is created. This is needed for the Twitter and Hashtag matches.
-		    suffixStr = "";  // A string to suffix the anchor tag that is created. This is used if there is a trailing parenthesis that should not be auto-linked.
+		    suffixStr = "",  // A string to suffix the anchor tag that is created. This is used if there is a trailing parenthesis that should not be auto-linked.
+
+		    urls = this.urls;  // the 'urls' config
 
 		// Return out with `null` for match types that are disabled (url, email,
 		// twitter, hashtag), or for matches that are invalid (false positives
 		// from the matcherRegex, which can't use look-behinds since they are
 		// unavailable in JS).
 		if(
-			( urlMatch && !this.urls ) ||
+			( schemeUrlMatch && !urls.schemeMatches ) ||
+			( wwwMatch && !urls.wwwMatches ) ||
+			( tldMatch && !urls.tldMatches ) ||
 			( emailAddressMatch && !this.email ) ||
 			( phoneMatch && !this.phone ) ||
 			( twitterMatch && !this.twitter ) ||
 			( hashtagMatch && !this.hashtag ) ||
-			!this.matchValidator.isValidMatch( urlMatch, protocolUrlMatch, protocolRelativeMatch )
+			!this.matchValidator.isValidMatch( urlMatch, schemeUrlMatch, protocolRelativeMatch )
 		) {
 			return null;
 		}
@@ -344,7 +358,7 @@ Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
 			suffixStr = ")";  // this will be added after the generated <a> tag
 		} else {
 			// Handle an invalid character after the TLD
-			var pos = this.matchHasInvalidCharAfterTld( urlMatch, protocolUrlMatch );
+			var pos = this.matchHasInvalidCharAfterTld( urlMatch, schemeUrlMatch );
 			if( pos > -1 ) {
 				suffixStr = matchStr.substr(pos);  // this will be added after the generated <a> tag
 				matchStr = matchStr.substr( 0, pos ); // remove the trailing invalid chars
@@ -396,7 +410,7 @@ Autolinker.matchParser.MatchParser = Autolinker.Util.extend( Object, {
 			match = new Autolinker.match.Url( {
 				matchedText : matchStr,
 				url : matchStr,
-				protocolUrlMatch : !!protocolUrlMatch,
+				protocolUrlMatch : !!schemeUrlMatch,
 				protocolRelativeMatch : !!protocolRelativeMatch,
 				stripPrefix : this.stripPrefix
 			} );
diff --git a/tests/AutolinkerSpec.js b/tests/AutolinkerSpec.js
diff --git a/tests/UtilSpec.js b/tests/UtilSpec.js