A Demo of the Regular Expression Advanced Features

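The following application consolidates the key principles that have been discussed in the article, namely: inline pattern comments combined with RegexOptions.IgnorePatternWhitespace, negative lookahead assertions, named groups, right-to-left matching, and replacement through a MatchEvaluator delegate.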

Specifically, this ASP.NET Web page downloads the page at the URL you enter, picks out the anchors whose href is neither a mailto: link nor fully qualified with http://, and prefixes each of those hrefs with http://www.4GuysFromRolla.com/.


Enter a Url:



Source Code
<%@ Page Language="vb" %>
<%@ Import Namespace="System" %>
<%@ Import Namespace="System.Text.RegularExpressions" %>
<%@ Import Namespace="System.Text" %>
<%@ Import Namespace="System.Net" %>
<%@ Import Namespace="System.IO" %>

<script language="VB" runat="server">
    ' Accumulates the report text; MatchHandler appends one entry per rewritten anchor.
    Private report As New StringBuilder()
    ' Text of the downloaded page; set in scrapeButton_Click and read by MatchHandler
    ' to work out the row and column of each match.
    Private webPage As String
    ' Number of anchors rewritten; MatchHandler increments it once per call.
    Private countOfMatches As Int32

    ' Click handler for the Scrape button.
    '
    ' Downloads the page named in urlTextBox (via GrabUrl), rewrites every anchor
    ' whose href is not a mailto: link and not already qualified with http:// or
    ' https:// (via MatchHandler), then writes a report of the fixes followed by the
    ' HTML-encoded rewritten page into resultLabel.
    '
    ' sender, e: standard event-handler arguments; neither is used.
    Private Sub scrapeButton_Click( ByVal sender As System.Object, ByVal e As System.EventArgs )
        webPage = GrabUrl()
        Dim myDelegate As New MatchEvaluator( AddressOf MatchHandler )

        ' IgnorePatternWhitespace lets the pattern be laid out with padding and inline
        ' (?# ...) comments. The scheme lookahead uses "https?" so that secure absolute
        ' links are skipped as well; with plain "http://" an https:// href passed the
        ' assertion and ended up with the site prefix glued in front of it.
        Dim linksExpression As New Regex( _
            "\<a				(?# Find the opening ANCHOR tag )" & _
            ".+?				(?# followed, minimally by everything up to the href attribute ) " & _
            "href=['""]			(?# up to the opening Href attribute ) " & _
            "(?!https?\:\/\/)			(?# assert that the next sequence is not http:// or https://) " & _
            "(?!mailto\:)			(?# ...or mailto:) " & _
            "(?<foundAnchor>[^'"">]+?)	(?# now, match everything up to the next ' or "" into a group named 'foundAnchor') " & _
            "[^>]*?				(?# followed, minimally by everything up to the closing tag ) " & _
            "\>				(?# then the end of the opening ANCHOR tag)", _
            RegexOptions.Multiline Or _
            RegexOptions.IgnoreCase Or _
            RegexOptions.IgnorePatternWhitespace _
        )

        ' Replace calls MatchHandler once per match; the handler also fills in
        ' report and countOfMatches as a side effect.
        Dim newWebPage As String = linksExpression.Replace( webPage, myDelegate )

        ' The URL is user input echoed back into markup, so it must be HTML-encoded
        ' to keep script or tags typed into the text box from being rendered.
        resultLabel.Text = "<h2>Report Result for " & Server.HtmlEncode( urlTextBox.Text ) & "</h2>" & _
            "<b>Found and fixed the following " & countOfMatches.ToString() & " anchors...</b><br><br>" & _
            report.ToString().Replace( Environment.NewLine, "<br>" )

        resultLabel.Text &= "<h2>Fixed Page</h2>" & Server.HtmlEncode( newWebPage )
    End Sub


    ' MatchEvaluator callback: invoked once for every anchor tag matched in webPage.
    '
    ' Appends a line to the report giving the (HTML-encoded) tag and its position,
    ' increments countOfMatches, and returns the tag with the captured href value
    ' prefixed by http://www.4guysfromrolla.com/ (a leading "/" on the href is
    ' dropped first so the result does not contain a double slash).
    '
    ' m: the match for one opening anchor tag; its "foundAnchor" group holds the href value.
    ' Returns: the replacement text for the whole matched tag.
    Private Function MatchHandler( ByVal m As Match ) As String
        Dim anchorGroup As Group = m.Groups( "foundAnchor" )
        Dim link As String = anchorGroup.Value

        ' A right-to-left "^" search starting at the match finds line starts at or
        ' before it: the first hit is the start of the current line, and the number
        ' of hits is the 1-based line number.
        Dim rToL As New Regex( "^", RegexOptions.Multiline Or RegexOptions.RightToLeft )
        Dim col, row As Int32
        Dim lineBegin As Int32 = rToL.Match( webPage, m.Index ).Index

        row = rToL.Matches( webPage, m.Index ).Count
        col = m.Index - lineBegin

        report.AppendFormat( _
            "Link <b>{0}</b>, fixed at row: {1}, col: {2}{3}", _
            Server.HtmlEncode( m.Value ), _
            row, _
            col, _
            Environment.NewLine _
        )

        Dim newLink As String
        If link.StartsWith( "/" ) Then
            newLink = link.Substring( 1 )
        Else
            newLink = link
        End If

        countOfMatches += 1

        ' Splice the new URL in at the exact position of the captured group.
        ' A plain String.Replace on the tag text rewrites EVERY occurrence of the
        ' link text, so a tag such as <a href="a"> had the "a" of "<a" replaced too.
        Dim tag As String = m.Value
        Dim offset As Int32 = anchorGroup.Index - m.Index
        Return tag.Substring( 0, offset ) & _
            "http://www.4guysfromrolla.com/" & newLink & _
            tag.Substring( offset + anchorGroup.Length )
    End Function

    ' Downloads the resource named in urlTextBox and returns its entire text.
    '
    ' Returns: the response body, decoded with StreamReader's default encoding.
    ' Throws:  UriFormatException when the text box does not hold an absolute URI;
    '          ArgumentException when the URI scheme is not http or https;
    '          whatever WebClient.OpenRead raises when the download itself fails.
    Private Function GrabUrl() As String
        ' The address comes straight from the user. Without a scheme check,
        ' WebClient would happily open file:// URIs and local paths on the server.
        Dim target As New Uri( urlTextBox.Text.Trim() )

        If target.Scheme <> Uri.UriSchemeHttp AndAlso target.Scheme <> Uri.UriSchemeHttps Then
            Throw New ArgumentException( "Only http:// and https:// URLs can be scraped." )
        End If

        Dim wc As New WebClient()
        Dim s As Stream = Nothing
        Dim sr As StreamReader = Nothing

        Try
            s = wc.OpenRead( target.AbsoluteUri )
            sr = New StreamReader( s )
            Return sr.ReadToEnd()
        Finally
            ' Release everything even when the download or read throws.
            ' Closing the reader also closes the stream it wraps.
            If Not sr Is Nothing Then
                sr.Close()
            ElseIf Not s Is Nothing Then
                s.Close()
            End If
            wc.Dispose()
        End Try
    End Function

</script>

<!-- Input form: urlTextBox holds the address to scrape, scrapeButton runs scrapeButton_Click on the server, and resultLabel receives the report that handler builds. -->
<form id="Form1" method="post" runat="server">
    <P>
        <asp:Label id="Label1" runat="server">Enter a Url: </asp:Label>
        <asp:TextBox id="urlTextBox" runat="server" Width="336px">http://www.4guysfromrolla.com/</asp:TextBox>
        <asp:Button OnClick="scrapeButton_Click" id="scrapeButton" runat="server" Text="Scrape..."></asp:Button></P>
    <HR width="100%" SIZE="1">
<P>
        <asp:Label id="resultLabel" runat="server"></asp:Label></P>
</form>


[Return to the article]