package org.apache.maven.wagon.shared.http;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.commons.io.IOUtils;
import org.apache.maven.wagon.TransferFailedException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

// Html File List Parser.
/** * Html File List Parser. */
public class HtmlFileListParser { // Apache Fancy Index Sort Headers private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" ); // URLs with excessive paths. private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" ); // URLs that to a parent directory. private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" ); // mailto urls private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" ); private static final Pattern[] SKIPS = new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
Params:
  • stream – the input stream.
Throws:
Returns:the file list.
/** * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list. * * @param stream the input stream. * @return the file list. * @throws TransferFailedException if there was a problem fetching the raw html. */
public static List<String> parseFileList( String baseurl, InputStream stream ) throws TransferFailedException { try { URI baseURI = new URI( baseurl ); // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe // assumption. String content = IOUtils.toString( stream, "utf-8" ); Document doc = Jsoup.parse( content, baseurl ); Elements links = doc.select( "a[href]" ); Set<String> results = new HashSet<String>(); for ( Element link : links ) { /* * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink */ String target = link.attr( "href" ); if ( target != null ) { String clean = cleanLink( baseURI, target ); if ( isAcceptableLink( clean ) ) { results.add( clean ); } } } return new ArrayList<String>( results ); } catch ( URISyntaxException e ) { throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e ); } catch ( IOException e ) { throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e ); } } private static String cleanLink( URI baseURI, String link ) { if ( link == null || link.length() == 0 ) { return ""; } String ret = link; try { URI linkuri = new URI( ret ); if ( link.startsWith( "/" ) ) { linkuri = baseURI.resolve( linkuri ); } URI relativeURI = baseURI.relativize( linkuri ).normalize(); ret = relativeURI.toASCIIString(); if ( ret.startsWith( baseURI.getPath() ) ) { ret = ret.substring( baseURI.getPath().length() ); } ret = URLDecoder.decode( ret, "UTF-8" ); } catch ( URISyntaxException e ) { // ignore } catch ( UnsupportedEncodingException e ) { // ignore } return ret; } private static boolean isAcceptableLink( String link ) { if ( link == null || link.length() == 0 ) { return false; } for ( Pattern pattern : SKIPS ) { if ( pattern.matcher( link ).find() ) { return false; } } return true; } }