package websurvey;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import nu.validator.htmlparser.common.DoctypeExpectation;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.dom.HtmlDocumentBuilder;
import nu.validator.htmlparser.sax.HtmlParser;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.xml.serializer.Method;
import org.apache.xml.serializer.OutputPropertiesFactory;
import org.apache.xml.serializer.Serializer;
import org.apache.xml.serializer.SerializerFactory;
import org.apache.xml.serializer.TreeWalker;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;

/**
 * A very small persistent object cache: every key is mapped to one file on
 * disk (named after the MD5 digest of the key) that holds a single
 * Java-serialized value.
 *
 * All public methods are synchronized on the instance. Failures are logged to
 * stderr and reported to the caller as a cache miss (get) or a no-op (set).
 */
class DiskCache {
	private static final String tmpDir = "/local/scratch/pjt47/html/websurvey-cache/";

	/** Directory prefix (always ending in "/") under which cache files are stored. */
	private final String baseDir;

	/** Creates a cache rooted at the default, hard-coded scratch directory. */
	DiskCache() {
		this(tmpDir);
	}

	/**
	 * Creates a cache rooted at the given directory.
	 *
	 * @param baseDir directory that will contain the cache files; it is
	 *        created on demand by {@link #set}. A trailing "/" is optional.
	 * @throws IllegalArgumentException if baseDir is null
	 */
	DiskCache(String baseDir) {
		if (baseDir == null)
			throw new IllegalArgumentException("baseDir must not be null");
		this.baseDir = baseDir.endsWith("/") ? baseDir : baseDir + "/";
	}

	/**
	 * Computes the path of the file that stores the value for a key.
	 *
	 * @return the file path, or null if the digest could not be computed
	 */
	private String find(String key) {
		try {
			MessageDigest digest = MessageDigest.getInstance("MD5");
			byte[] hash = digest.digest(key.getBytes("utf8"));
			StringBuilder hex = new StringBuilder();
			// NOTE(review): bytes below 0x10 are appended as a single hex digit
			// (no zero padding). This is kept as-is so that files written by
			// earlier runs remain addressable; padding would rename every entry.
			for (byte b : hash)
				hex.append(Integer.toHexString(0xFF & b));

			// 16 digest bytes give at least 16 hex characters, so both
			// substring calls are always in range.
			return this.baseDir + hex.substring(0, 2) + "/" + hex.substring(2, 15);
		} catch (NoSuchAlgorithmException e) {
			e.printStackTrace();
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Looks up a previously stored value.
	 *
	 * @param key cache key
	 * @return the deserialized value, or null if there is no entry or the
	 *         entry could not be read
	 */
	public synchronized Object get(String key) {
		try {
			String filename = find(key);
			if (filename == null)
				return null;
			File file = new File(filename);
			if (file.exists()) {
				FileInputStream fis = new FileInputStream(file);
				try {
					ObjectInputStream ois = new ObjectInputStream(fis);
					return ois.readObject();
				} finally {
					// Closing the file stream releases the only OS resource
					// involved, even when deserialization fails.
					fis.close();
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Stores a value, replacing any existing entry for the key.
	 *
	 * @param key cache key
	 * @param value object to serialize; a value that cannot be serialized is
	 *        logged and not stored
	 */
	public synchronized void set(String key, Object value) {
		try {
			String filename = find(key);
			if (filename == null)
				return;
			File file = new File(filename);
			File dir = file.getParentFile();
			if (dir != null)
				dir.mkdirs();
			FileOutputStream fos = new FileOutputStream(file);
			boolean written = false;
			try {
				ObjectOutputStream oos = new ObjectOutputStream(fos);
				oos.writeObject(value);
				oos.flush();
				written = true;
			} finally {
				try {
					fos.close();
				} finally {
					// Do not leave a truncated entry behind; a later get()
					// would only fail on it.
					if (!written)
						file.delete();
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

/**
 * Wraps an HttpClient with a DiskCache so that each URI is fetched at most
 * once; later requests are answered from the stored Resource, including
 * stored failures.
 */
class CachedHttpClient {
	/** Serializable snapshot of one GET: either the response data or the I/O failure. */
	public static class Resource implements Serializable {
		private static final long serialVersionUID = 8262118816275659429L;
		IOException exception;
		int statusCode;
		String statusLine;
		String documentUri;
		Header[] headers;
		String encoding;
		byte[] body;
	}

	private HttpClient httpClient;
	private DiskCache cache;

	public CachedHttpClient(HttpClient httpClient) {
		this.cache = new DiskCache();
		this.httpClient = httpClient;
	}

	/**
	 * Returns the resource for a URI, downloading and caching it on a miss.
	 *
	 * @param sourceUri URI to fetch
	 * @return the cached or freshly downloaded resource
	 * @throws IOException the failure recorded for this URI, whether it
	 *         happened just now or was loaded from the cache
	 */
	public Resource getResource(String sourceUri) throws IOException {
		// TODO: there's an annoying (but usually harmless?) race condition here,
		// if get returns null
		Resource resource = (Resource) this.cache.get(sourceUri);
		if (resource == null) {
			resource = download(sourceUri);
			this.cache.set(sourceUri, resource);
		}
		if (resource.exception == null)
			return resource;
		throw resource.exception;
	}

	/**
	 * Performs the GET and copies the outcome into a new Resource. An
	 * IOException is captured in the resource rather than thrown.
	 */
	private Resource download(String sourceUri) {
		Resource fetched = new Resource();
		GetMethod request = null;
		try {
			request = new GetMethod(sourceUri);
			HttpMethodParams requestParams = request.getParams();
			requestParams.setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(3, false));

			this.httpClient.executeMethod(request);

			fetched.statusCode = request.getStatusCode();
			fetched.statusLine = request.getStatusLine().toString();
			fetched.documentUri = request.getURI().getURI();
			fetched.headers = request.getResponseHeaders();
			fetched.encoding = request.getResponseCharSet();
			// Body reads are capped at 256 KiB.
			fetched.body = request.getResponseBody(1024*256);
		} catch (IOException failure) {
			fetched.exception = failure;
		} finally {
			if (request != null)
				request.releaseConnection();
		}
		return fetched;
	}
}

/**
 * Writes survey results as XML. Output is a single "survey" root element
 * whose children are emitted through the synchronized report methods, so one
 * Reporter can be shared by several worker threads.
 *
 * Usage: beginOutput, any number of report calls, then finishOutput.
 */
class Reporter {

	/** Control characters that are replaced with a space in attribute values before serialization. */
	private static final Pattern invalidChars = Pattern.compile("[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F]");

	private OutputStream out;
	private ContentHandler outHandler;

	/**
	 * Emits an empty element with the given attributes.
	 *
	 * @param element element name
	 * @param attributes alternating attribute names and values; pairs whose
	 *        value is null are omitted. Must contain an even number of entries.
	 */
	synchronized public void report(String element, String... attributes) throws SAXException {
		report(null, element, attributes);
	}

	/** Serializes the given DOM subtree into the survey output. */
	synchronized public void report(Node node) throws SAXException {
		new TreeWalker(this.outHandler).traverse(node);
	}

	/**
	 * Emits an element with the given attributes, optionally wrapping a
	 * serialized copy of a DOM subtree.
	 *
	 * @param node subtree to embed as content, or null for an empty element
	 * @param element element name
	 * @param attributes alternating attribute names and values; pairs whose
	 *        value is null are omitted. Must contain an even number of entries.
	 */
	synchronized public void report(Node node, String element, String... attributes) throws SAXException {
		AttributesImpl attrs = new AttributesImpl();
		for (int i = 0; i < attributes.length; i += 2) {
			if (attributes[i + 1] != null) {
				String value = invalidChars.matcher(attributes[i+1]).replaceAll(" ");
				attrs.addAttribute(null, null, attributes[i], null, value);
			}
		}
		this.outHandler.startElement(null, null, element, attrs);
		if (node != null)
			this.report(node);
		this.outHandler.endElement(null, null, element);
	}

	/** Replays the given DOM subtree as SAX events on an arbitrary handler. */
	synchronized public void report(Node node, ContentHandler handler) throws SAXException {
		new TreeWalker(handler).traverse(node);
	}

	/** Returns the handler that writes to the output file; null before beginOutput. */
	public ContentHandler getContentHandler() {
		return outHandler;
	}

	/**
	 * Opens the output file and writes the opening "survey" tag.
	 *
	 * @param filename path of the XML file to create or overwrite
	 */
	synchronized public void beginOutput(String filename) throws IOException, SAXException {
		java.util.Properties props = OutputPropertiesFactory.getDefaultMethodProperties(Method.XML);
		props.setProperty("indent", "yes");
		props.setProperty("omit-xml-declaration", "yes");
		Serializer ser = SerializerFactory.getSerializer(props);
		this.out = new FileOutputStream(filename);
		ser.setOutputStream(this.out);

		this.outHandler = ser.asContentHandler();
		this.outHandler.startDocument();
		this.outHandler.startElement(null, null, "survey", null);
	}

	/**
	 * Writes the closing "survey" tag, ends the document and closes the
	 * output file.
	 *
	 * @throws SAXException if the document could not be finished, or if
	 *         closing the file failed (the IOException is the cause)
	 */
	synchronized public void finishOutput() throws SAXException {
		boolean finished = false;
		try {
			this.outHandler.endElement(null, null, "survey");
			this.outHandler.endDocument();
			finished = true;
		} finally {
			OutputStream stream = this.out;
			this.out = null;
			if (stream != null) {
				try {
					stream.close();
				} catch (IOException e) {
					// Only surface the close failure when it would not hide
					// an exception already propagating from the try block.
					if (finished)
						throw new SAXException("Failed to close survey output", e);
				}
			}
		}
	}

}

/**
 * Base class for per-URL survey tasks. run() fetches the page through the
 * CachedHttpClient, reports non-200 responses as errors, and hands successful
 * responses to processPage.
 */
abstract class PageProcessor implements Runnable {
	protected CachedHttpClient httpClient;
	protected String sourceUri;
	protected String documentUri; // may differ from sourceUri if redirects were followed
	protected Reporter reporter;

	/** Supplies the collaborators and target URI; must be called before run(). */
	public void initialise(CachedHttpClient httpClient, String sourceUri, Reporter reporter) {
		this.httpClient = httpClient;
		this.sourceUri = sourceUri;
		this.reporter = reporter;
	}

	/**
	 * Fetches the page and processes it. No exception escapes: I/O failures
	 * are logged to stderr as a one-line message, anything else with a stack
	 * trace.
	 */
	public void run() {
		try {
			CachedHttpClient.Resource res = this.httpClient.getResource(this.sourceUri);

			this.documentUri = res.documentUri;
			if (res.statusCode != HttpStatus.SC_OK) {
				System.err.println("GET failed: " + res.statusLine);
				reporter.report("error", "uri", this.sourceUri, "message", "HTTP error: " + res.statusLine);
				return;
			}
			
			processPage(res);

		} catch (IOException e) {
			System.err.println("Error on " + this.sourceUri + " : " + e.getMessage());
		} catch (Exception e) {
			System.err.println("Error on " + this.sourceUri + " : " + e.getMessage());
			e.printStackTrace();
		}
	}
	
	/** Reports a redirect (when the final URI differs from the source) and every response header. */
	protected void reportHeaders(Header[] headers) throws SAXException {
		if (! this.sourceUri.equals(this.documentUri))
			this.reporter.report("redirect", "uri", this.sourceUri, "destination", this.documentUri);
		for (Header h : headers)
			this.reporter.report("header", "uri", this.sourceUri, "name", h.getName(), "value", h.getValue());
	}

	/**
	 * Returns the value of the first header with the given name.
	 *
	 * @param headers response headers to search
	 * @param name header name; matched case-insensitively, because HTTP
	 *        field names are case-insensitive
	 * @return the header value, or null if no such header is present
	 */
	protected String findHeader(Header[] headers, String name) throws SAXException {
		for (Header h : headers)
			if (name.equalsIgnoreCase(h.getName()))
				return h.getValue();
		return null;
	}

	/**
	 * Extracts the bare media type from a Content-Type header value.
	 *
	 * Parameters after the first ';' are dropped, surrounding whitespace is
	 * removed and the result is lower-cased, so that values such as
	 * "Text/HTML ; charset=utf-8" compare equal to "text/html".
	 *
	 * @param header raw header value, may be null
	 * @return the normalized media type, or null if header is null
	 */
	static protected String extractContentType(String header) {
		if (header == null)
			return null;
		int i = header.indexOf(';');
		String type = (i == -1) ? header : header.substring(0, i);
		// Fixed locale keeps the result independent of the JVM default
		// (e.g. the Turkish dotless i).
		return type.trim().toLowerCase(java.util.Locale.ENGLISH);
	}

	/** Handles a successfully fetched (HTTP 200) resource. */
	abstract void processPage(CachedHttpClient.Resource resource) throws Exception;

	/** Creates a fresh processor for each URL. */
	interface Factory {
		PageProcessor create();
	}
}

/**
 * SAX handler that reports every occurrence of a configured set of attribute
 * names, together with the element it appeared on and its value.
 */
class AttributeExtractor extends DefaultHandler {

	// Attribute names to report. Earlier survey runs used these alternative sets:
	//   "align", "bgcolor", "clear", "color", "face", "height", "hreflang",
	//   "http-equiv", "lang", "language", "maxlength", "media", "name", "rel",
	//   "rev", "size", "target", "type", "valign", "width"
	// and:
	//   "action", "href", "src", "usemap", "classid", "codebase", "data", "cite", "profile"
	private static final Set<String> interestingAttributes =
			new TreeSet<String>(Arrays.asList("method"));

	private Reporter reporter;
	private String sourceUri;

	public AttributeExtractor(Reporter reporter, String sourceUri) {
		this.sourceUri = sourceUri;
		this.reporter = reporter;
	}

	/** Emits one "attribute" record per interesting attribute on the element. */
	public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
		int index = 0;
		while (index < attributes.getLength()) {
			String attrName = attributes.getLocalName(index);
			if (interestingAttributes.contains(attrName)) {
				String attrValue = attributes.getValue(index);
				this.reporter.report("attribute", "uri", this.sourceUri, "element", name, "name", attrName, "value", attrValue);
			}
			index++;
		}
	}

}

/**
 * ContentHandler that forwards every SAX event unchanged to another handler.
 * Subclasses override individual callbacks and call super to continue the
 * chain.
 */
class ChainedHandler implements ContentHandler {

	/** Receiver of all forwarded events. */
	private ContentHandler next;

	ChainedHandler(ContentHandler parent) {
		this.next = parent;
	}

	// --- document level ---

	public void setDocumentLocator(Locator locator) {
		this.next.setDocumentLocator(locator);
	}

	public void startDocument() throws SAXException {
		this.next.startDocument();
	}

	public void endDocument() throws SAXException {
		this.next.endDocument();
	}

	// --- namespaces and elements ---

	public void startPrefixMapping(String prefix, String uri) throws SAXException {
		this.next.startPrefixMapping(prefix, uri);
	}

	public void endPrefixMapping(String prefix) throws SAXException {
		this.next.endPrefixMapping(prefix);
	}

	public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
		this.next.startElement(uri, localName, name, atts);
	}

	public void endElement(String uri, String localName, String name) throws SAXException {
		this.next.endElement(uri, localName, name);
	}

	// --- content ---

	public void characters(char[] ch, int start, int length) throws SAXException {
		this.next.characters(ch, start, length);
	}

	public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
		this.next.ignorableWhitespace(ch, start, length);
	}

	public void processingInstruction(String target, String data) throws SAXException {
		this.next.processingInstruction(target, data);
	}

	public void skippedEntity(String name) throws SAXException {
		this.next.skippedEntity(name);
	}

}

/**
 * Chained handler that reports the http-equiv, name and content attributes of
 * every meta element before passing the event on.
 */
class MetadataExtractor extends ChainedHandler {

	private Reporter reporter;
	private String sourceUri;

	public MetadataExtractor(ContentHandler parent, Reporter reporter, String sourceUri) {
		super(parent);
		this.sourceUri = sourceUri;
		this.reporter = reporter;
	}

	public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
		if ("meta".equals(localName))
			reportMeta(attributes);
		super.startElement(uri, localName, name, attributes);
	}

	/**
	 * Emits one "meta" record. (An earlier variant reported only
	 * name="generator" entries; all meta elements are reported now.)
	 */
	private void reportMeta(Attributes attributes) throws SAXException {
		String httpEquiv = attributes.getValue("http-equiv");
		String metaName = attributes.getValue("name");
		String metaContent = attributes.getValue("content");
		this.reporter.report("meta", "uri", this.sourceUri, "http-equiv", httpEquiv, "name", metaName, "content", metaContent);
	}
}

/**
 * Chained handler that counts start tags per local name and reports the
 * totals when the document ends.
 */
class TagCounter extends ChainedHandler {

	private Reporter reporter;
	private String sourceUri;
	private HashMap<String, Integer> counts = new HashMap<String, Integer>();

	public TagCounter(ContentHandler parent, Reporter reporter, String sourceUri) {
		super(parent);
		this.sourceUri = sourceUri;
		this.reporter = reporter;
	}

	/** Increments the tally for this element, then forwards the event. */
	public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
		Integer seenSoFar = counts.get(localName);
		int updated = (seenSoFar == null) ? 1 : seenSoFar + 1;
		counts.put(localName, updated);

		super.startElement(uri, localName, name, attributes);
	}

	/** Emits one "tag" record per distinct element name, then forwards the event. */
	public void endDocument() throws SAXException {
		for (java.util.Map.Entry<String, Integer> tally : counts.entrySet()) {
			String tag = tally.getKey();
			String total = tally.getValue().toString();
			this.reporter.report("tag", "uri", this.sourceUri, "name", tag, "count", total);
		}

		super.endDocument();
	}
}

/**
 * Processor that reports response headers and, for text/html pages, runs the
 * body through an HTML parser feeding an AttributeExtractor.
 */
class PageDownloader2 extends PageProcessor {

	public void processPage(CachedHttpClient.Resource resource) throws Exception {
		reportHeaders(resource.headers);

		String mediaType = extractContentType(findHeader(resource.headers, "Content-Type"));
		boolean isHtml = "text/html".equals(mediaType);
		if (isHtml) {
			// TODO: abort the download for non-text/html resources
			InputSource source = new InputSource(new ByteArrayInputStream(resource.body));

			HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.ALLOW);
			htmlParser.setContentHandler(new AttributeExtractor(this.reporter, this.sourceUri));
			source.setEncoding(resource.encoding);
			htmlParser.parse(source);
		}
	}
}

/**
 * Processor that reports response headers and, for text/html pages, parses
 * the body through a TagCounter chained onto a MetadataExtractor.
 */
class PageDownloader3 extends PageProcessor {

	public void processPage(CachedHttpClient.Resource resource) throws Exception {
		reportHeaders(resource.headers);

		String mediaType = extractContentType(findHeader(resource.headers, "Content-Type"));
		boolean isHtml = "text/html".equals(mediaType);
		if (isHtml) {
			// TODO: abort the download for non-text/html resources
			InputSource source = new InputSource(new ByteArrayInputStream(resource.body));

			HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.ALLOW);
			// Events reach the tag counter first, then the metadata extractor,
			// and end in a no-op DefaultHandler.
			ContentHandler metadata = new MetadataExtractor(new DefaultHandler(), this.reporter, this.sourceUri);
			ContentHandler tagCounts = new TagCounter(metadata, this.reporter, this.sourceUri);
			htmlParser.setContentHandler(tagCounts);
			source.setEncoding(resource.encoding);
			htmlParser.parse(source);
		}
	}
}

/**
 * SAX ErrorHandler that turns every parser diagnostic into a "parseerror"
 * record tagged with its severity.
 */
class ParseErrorReporter implements ErrorHandler
{
	private Reporter reporter;
	private String sourceUri;

	public ParseErrorReporter(Reporter reporter, String sourceUri) {
		this.sourceUri = sourceUri;
		this.reporter = reporter;
	}

	public void warning(SAXParseException exception) throws SAXException {
		emit("warning", exception);
	}

	public void error(SAXParseException exception) throws SAXException {
		emit("error", exception);
	}

	public void fatalError(SAXParseException exception) throws SAXException {
		emit("fatalError", exception);
	}

	/** Writes one parseerror record with the given severity label. */
	private void emit(String severity, SAXParseException problem) throws SAXException {
		this.reporter.report("parseerror", "uri", sourceUri, "type", severity, "message", problem.getMessage());
	}
}

/**
 * Processor that reports response headers and, for text/html pages, marks
 * the page as processed and records every HTML parse error.
 */
class ParseErrorLister extends PageProcessor {

	public void processPage(CachedHttpClient.Resource resource) throws Exception {
		reportHeaders(resource.headers);

		String mediaType = extractContentType(findHeader(resource.headers, "Content-Type"));
		boolean isHtml = "text/html".equals(mediaType);
		if (isHtml) {
			// TODO: abort the download for non-text/html resources
			this.reporter.report("processed", "uri", this.sourceUri);

			InputSource source = new InputSource(new ByteArrayInputStream(resource.body));

			HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.ALLOW);
			ErrorHandler errorSink = new ParseErrorReporter(this.reporter, this.sourceUri);
			// Content is discarded; only the diagnostics matter here.
			htmlParser.setContentHandler(new DefaultHandler());
			htmlParser.setErrorHandler(errorSink);
			htmlParser.setDoctypeExpectation(DoctypeExpectation.NO_DOCTYPE_ERRORS);
			source.setEncoding(resource.encoding);
			htmlParser.parse(source);
		}
	}
}

/**
 * Processor that reports response headers and, for text/html pages, builds a
 * DOM and reports every abbr and acronym element with its title and text.
 */
class AbbreviationLister extends PageProcessor {

	public void processPage(CachedHttpClient.Resource resource) throws Exception {
		reportHeaders(resource.headers);

		String mediaType = extractContentType(findHeader(resource.headers, "Content-Type"));
		boolean isHtml = "text/html".equals(mediaType);
		if (!isHtml)
			return;

		this.reporter.report("processed", "uri", this.sourceUri);

		InputSource source = new InputSource(new ByteArrayInputStream(resource.body));

		HtmlDocumentBuilder builder = new HtmlDocumentBuilder(XmlViolationPolicy.ALTER_INFOSET);
		source.setEncoding(resource.encoding);
		Document doc = builder.parse(source);

		for (String type : new String[] { "abbr", "acronym" }) {
			NodeList found = doc.getElementsByTagName(type);
			for (int index = 0; index < found.getLength(); index++) {
				Node element = found.item(index);
				Node titleAttr = element.getAttributes().getNamedItem("title");
				String titleStr;
				if (titleAttr != null)
					titleStr = titleAttr.getTextContent();
				else
					titleStr = ""; // a missing title is reported as an empty string
				String text = element.getTextContent();
				// The element itself is serialized as the record's content.
				this.reporter.report(element, "abbreviation", "uri", sourceUri, "type", type, "title", titleStr, "text", text);
			}
		}
	}
}

/**
 * Processor that reports response headers and every match of a regular
 * expression in the decoded response body.
 */
class PageRegexProcessor extends PageProcessor {

	private final Pattern regex;
	private final boolean htmlOnly;

	/**
	 * @param pattern regular expression to search for
	 * @param onlyTextHtml when true, bodies whose content type is not
	 *        text/html are skipped
	 */
	public PageRegexProcessor(String pattern, boolean onlyTextHtml) {
		this.regex = Pattern.compile(pattern);
		this.htmlOnly = onlyTextHtml;
	}

	public void processPage(CachedHttpClient.Resource resource) throws Exception {
		reportHeaders(resource.headers);

		String contentType = findHeader(resource.headers, "Content-Type");
		if (this.htmlOnly) {
			boolean isHtml = "text/html".equals(extractContentType(contentType));
			if (!isHtml)
				return;
		}

		Matcher hits = this.regex.matcher(decodeBody(resource));
		while (hits.find()) {
			String matched = hits.group();
			this.reporter.report("match", "uri", this.sourceUri, "content-type", contentType, "string", matched);
		}
	}

	/** Decodes the body with the response charset, falling back to ISO-8859-1 if that charset is unsupported. */
	private static String decodeBody(CachedHttpClient.Resource resource) throws UnsupportedEncodingException {
		try {
			return new String(resource.body, resource.encoding);
		} catch (UnsupportedEncodingException unsupported) {
			return new String(resource.body, "iso-8859-1");
		}
	}
}

/**
 * Entry point for the web survey: reads a list of URLs, fetches each one on
 * a thread pool and writes the findings of a chosen PageProcessor to an XML
 * file.
 */
public class Test2 {
	/** File containing one URL per line. */
	private static final String URL_LIST = "../pages/dmoz-random-pages.txt";

	/** Upper bound on the number of URLs taken from the list. */
	private static final int MAX_URLS = 1024*16;

	public static void main(String[] args) throws Exception {
		//findAttributeValues();
		//findTagsAndMetadata();
		//findParseErrors();
		//findXMLPIs();
		//findDoctypes();
		findAbbreviations();
	}

	/**
	 * Runs one survey: creates a processor per URL, executes them all on a
	 * fixed-size thread pool, prints progress once a second and finishes the
	 * output document when the pool has drained.
	 *
	 * @param outputFilename XML file to write
	 * @param urls URLs to visit
	 * @param processorFactory creates the processor used for each URL
	 */
	public static void runSurvey(String outputFilename, Iterable<String> urls, PageProcessor.Factory processorFactory)
			throws IOException, SAXException {

		MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
		HttpClient client = new HttpClient(connectionManager);
		client.getHttpConnectionManager().getParams().setConnectionTimeout(10000);
		client.getHttpConnectionManager().getParams().setSoTimeout(10000);
		int maxConnections = 128;
		client.getHttpConnectionManager().getParams().setMaxTotalConnections(maxConnections);
		
		CachedHttpClient cachedClient = new CachedHttpClient(client);

		Reporter reporter = new Reporter();
		reporter.beginOutput(outputFilename);

		BlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();
		// A few more threads than connections, so some workers can parse
		// while others wait on the network.
		ThreadPoolExecutor executor = new ThreadPoolExecutor(maxConnections + 16, maxConnections + 16, 5,
				TimeUnit.SECONDS, queue);

		for (String url : urls) {
			PageProcessor processor = processorFactory.create();
			processor.initialise(cachedClient, url, reporter);
			executor.execute(processor);
		}

		executor.shutdown();

		long time = System.currentTimeMillis();
		try {
			while (executor.awaitTermination(1, TimeUnit.SECONDS) == false) {
				System.out.println(queue.size() + " left; " + (System.currentTimeMillis() - time) / 1000 + " seconds");
			}
		} catch (InterruptedException e) {
			e.printStackTrace();
			// Preserve the interrupt for whoever called us.
			Thread.currentThread().interrupt();
		}
		reporter.finishOutput();
	}

	/**
	 * Reads up to limit lines from a text file, one URL per line.
	 *
	 * Reading stops at end of file, so a short list yields fewer entries
	 * rather than null URLs, and the reader is always closed.
	 *
	 * @param filename file to read
	 * @param limit maximum number of lines to return
	 * @return the lines read, in file order
	 */
	private static List<String> readUrls(String filename, int limit) throws IOException {
		List<String> urls = new ArrayList<String>();
		BufferedReader urlReader = new BufferedReader(new FileReader(filename));
		try {
			String line;
			// Check the size first so no extra line is consumed at the limit.
			while (urls.size() < limit && (line = urlReader.readLine()) != null)
				urls.add(line);
		} finally {
			urlReader.close();
		}
		return urls;
	}

	/**
	 * Searches HTML pages for conditional-comment-like markup.
	 * NOTE(review): despite its name this runs a regex survey and writes
	 * output-cc.xml; the name is kept for callers.
	 */
	public static void findAttributeValues() throws IOException, SAXException {
		runSurvey("output-cc.xml", readUrls(URL_LIST, MAX_URLS), new PageProcessor.Factory() {
			public PageProcessor create() {
				return new PageRegexProcessor("<!-*\\[[^]]*\\]-*>", true);
			}
		});
	}

	/** Collects doctype declarations from HTML pages. */
	public static void findDoctypes() throws IOException, SAXException {
		runSurvey("output-doctype.xml", readUrls(URL_LIST, MAX_URLS), new PageProcessor.Factory() {
			public PageProcessor create() {
				return new PageRegexProcessor("(?xi) <!doctype [^>\\s]* ( ( \\s+ | \"[^\"]*\" | '[^']*' ) [^>\\s]* )* >", true);
			}
		});
	}

	/** Collects "<?...>" constructs from pages of any content type. */
	public static void findXMLPIs() throws IOException, SAXException {
		runSurvey("output-pis.xml", readUrls(URL_LIST, MAX_URLS), new PageProcessor.Factory() {
			public PageProcessor create() {
				return new PageRegexProcessor("<\\?[^>]*>", false);
			}
		});
	}

	/** Counts tags and collects meta elements from HTML pages. */
	public static void findTagsAndMetadata() throws IOException, SAXException {
		runSurvey("output-tagsmeta.xml", readUrls(URL_LIST, MAX_URLS), new PageProcessor.Factory() {
			public PageProcessor create() {
				return new PageDownloader3();
			}
		});
	}

	/** Records HTML parse errors for each page. */
	public static void findParseErrors() throws IOException, SAXException {
		runSurvey("output-errors.xml", readUrls(URL_LIST, MAX_URLS), new PageProcessor.Factory() {
			public PageProcessor create() {
				return new ParseErrorLister();
			}
		});
	}

	/** Collects abbr and acronym elements from HTML pages. */
	public static void findAbbreviations() throws IOException, SAXException {
		runSurvey("output-abbrs.xml", readUrls(URL_LIST, MAX_URLS), new PageProcessor.Factory() {
			public PageProcessor create() {
				return new AbbreviationLister();
			}
		});
	}
}

