mirror of
https://git.FreeBSD.org/ports.git
synced 2024-11-26 00:55:14 +00:00
Jericho HTML Parser is a simple but powerful Java library allowing
analysis and manipulation of parts of an HTML document, including some common server-side tags, while reproducing verbatim any unrecognised or invalid HTML. It also provides high-level HTML form manipulation functions. WWW: http://jerichohtml.sourceforge.net/doc/index.html PR: ports/124770 Submitted by: Marcin Cieslak <saper at SYSTEM.PL>
This commit is contained in:
parent
48c8d2e494
commit
8d9582086c
Notes:
svn2git
2021-03-31 03:12:20 +00:00
svn path=/head/; revision=217126
@ -279,6 +279,7 @@
|
||||
SUBDIR += jakarta-tomcat4
|
||||
SUBDIR += jakarta-tomcat5
|
||||
SUBDIR += jdresolve
|
||||
SUBDIR += jericho-html
|
||||
SUBDIR += jesred
|
||||
SUBDIR += jetspeed
|
||||
SUBDIR += jetty
|
||||
|
52
www/jericho-html/Makefile
Normal file
52
www/jericho-html/Makefile
Normal file
@ -0,0 +1,52 @@
|
||||
# New ports collection makefile for: jericho-html
|
||||
# Date created: 2008-06-17
|
||||
# Whom: Marcin Cieslak <saper@SYSTEM.PL>
|
||||
#
|
||||
# $FreeBSD$
|
||||
#
|
||||
|
||||
PORTNAME=	jericho-html
PORTVERSION=	2.5
CATEGORIES=	www java
MASTER_SITES=	SF
# Drop the first hyphen from PORTNAME, giving "jerichohtml" as the
# SourceForge subdirectory.
MASTER_SITE_SUBDIR=	${PORTNAME:S,-,,}

MAINTAINER=	saper@SYSTEM.PL
COMMENT=	A java library to analyse and manipulate HTML

USE_ZIP=	yes
# USE_JAVA is only the on/off switch; the acceptable JDK range belongs in
# JAVA_VERSION rather than being packed into USE_JAVA (legacy form).
USE_JAVA=	yes
JAVA_VERSION=	1.3+

# Colon-separated classpath of the API jars shipped inside the distfile that
# are needed to compile the sources.  The line continuations leave whitespace
# inside the quoted value; the build recipes remove it with ${INTERFACES:S, ,,g}.
INTERFACES:=	"compile-time-dependencies/slf4j-api-1.4.3.jar:\
		compile-time-dependencies/commons-logging-api-1.1.jar:\
		compile-time-dependencies/log4j-api-1.2.14.jar"

PORTDOCS=	api
PLIST_FILES+=	%%JAVAJARDIR%%/${PORTNAME}.jar
|
||||
|
||||
# Build steps, all relative to ${WRKSRC}:
#   1. empty classes/ and compile the library sources (including the nodoc
#      package) into it, with the bundled API jars on the classpath;
#   2. pack the compiled classes into lib/${PORTNAME}.jar;
#   3. unless NOPORTDOCS is defined, empty doc/ and generate the javadoc
#      into doc/api, leaving the nodoc package out of the documentation.
# Steps 1 and 3 run in a subshell so the "cd" does not leak into later commands.
do-build:
	(cd ${WRKSRC} && \
		${RM} -rf classes/* && \
		${JAVAC} -classpath ${INTERFACES:S, ,,g} -d classes \
			src/java/au/id/jericho/lib/html/*.java \
			src/java/au/id/jericho/lib/html/nodoc/*.java)
	${JAR} -cf ${WRKSRC}/lib/${PORTNAME}.jar -C ${WRKSRC}/classes .
.if !defined(NOPORTDOCS)
	(cd ${WRKSRC} && \
		${RM} -rf doc/* && \
		${JAVADOC} -quiet \
			-windowtitle "Jericho HTML Parser ${PORTVERSION}" \
			-classpath ${INTERFACES:S, ,,g}:src/java:classes \
			-use -d ${WRKSRC}/doc/api \
			-subpackages au.id.jericho.lib.html \
			-exclude au.id.jericho.lib.html.nodoc \
			-noqualifier au.id.jericho.lib.html \
			-group "Core package" au.id.jericho.lib.html)
.endif
|
||||
|
||||
# Install the built jar into ${JAVAJARDIR} and, unless NOPORTDOCS is defined,
# copy the generated doc/api tree into ${DOCSDIR}.
#
# The trailing slash on the jar destination makes install(1) treat it as a
# directory: if ${JAVAJARDIR} is missing the step fails instead of silently
# creating a regular file with that name.
do-install:
	${INSTALL_DATA} ${WRKSRC}/lib/${PORTNAME}.jar ${JAVAJARDIR}/
.if !defined(NOPORTDOCS)
	${MKDIR} ${DOCSDIR}
	(cd ${WRKSRC}/doc && ${FIND} api | ${CPIO} -pdmu ${DOCSDIR})
.endif
|
||||
|
||||
.include <bsd.port.mk>
|
3
www/jericho-html/distinfo
Normal file
3
www/jericho-html/distinfo
Normal file
@ -0,0 +1,3 @@
|
||||
MD5 (jericho-html-2.5.zip) = 64306d0eb82608e50496a680b319182d
|
||||
SHA256 (jericho-html-2.5.zip) = 212b9e8b72f9787dfafd046e8716f0d04365afcd3f4d2fb293e69d5b90e456b4
|
||||
SIZE (jericho-html-2.5.zip) = 1456664
|
15
www/jericho-html/files/patch-encoding
Normal file
15
www/jericho-html/files/patch-encoding
Normal file
@ -0,0 +1,15 @@
|
||||
--- src/java/au/id/jericho/lib/html/StreamEncodingDetector.java.orig 2008-06-17 21:01:53.890292905 +0200
|
||||
+++ src/java/au/id/jericho/lib/html/StreamEncodingDetector.java 2008-06-17 21:02:43.940300330 +0200
|
||||
@@ -203,9 +203,9 @@
|
||||
// Assume the more likely case of four 8-bit characters <= U+00FF.
|
||||
// Check whether it fits some common EBCDIC strings that might be found at the start of a document:
|
||||
if (b1==0x4C) { // first character is EBCDIC '<' (ASCII 'L'), check a couple more characters before assuming EBCDIC encoding:
|
||||
- if (b2==0x6F && b3==0xA7 && b4==0x94) return setEncoding(EBCDIC,"default EBCDIC encoding (<?xml...> detected)"); // first four bytes are "<?xm" in EBCDIC ("Lo§”" in Windows-1252)
|
||||
- if (b2==0x5A && b3==0xC4 && b4==0xD6) return setEncoding(EBCDIC,"default EBCDIC encoding (<!DOCTYPE...> detected)"); // first four bytes are "<!DO" in EBCDIC ("LZÄÖ" in Windows-1252)
|
||||
- if ((b2&b3&b4&0x80)!=0) return setEncoding(EBCDIC,"default EBCDIC-compatible encoding (HTML element detected)"); // all of the 3 bytes after the '<' have the high-order bit set, indicating EBCDIC letters such as "<HTM" ("LÈãÔ" in Windows-1252), or "<htm" ("Lˆ£”" in Windows-1252)
|
||||
+ if (b2==0x6F && b3==0xA7 && b4==0x94) return setEncoding(EBCDIC,"default EBCDIC encoding (<?xml...> detected)"); // first four bytes are "<?xm" in EBCDIC
|
||||
+ if (b2==0x5A && b3==0xC4 && b4==0xD6) return setEncoding(EBCDIC,"default EBCDIC encoding (<!DOCTYPE...> detected)"); // first four bytes are "<!DO" in EBCDIC
|
||||
+ if ((b2&b3&b4&0x80)!=0) return setEncoding(EBCDIC,"default EBCDIC-compatible encoding (HTML element detected)"); // all of the 3 bytes after the '<' have the high-order bit set, indicating EBCDIC letters such as "<HTM" or "<htm"
|
||||
// although this is not an exhaustive check for EBCDIC, it is safer to assume a more common preliminary encoding if none of these conditions are met.
|
||||
}
|
||||
// Now confident that it is not EBCDIC, but some other 8-bit encoding.
|
8
www/jericho-html/pkg-descr
Normal file
8
www/jericho-html/pkg-descr
Normal file
@ -0,0 +1,8 @@
|
||||
Jericho HTML Parser is a simple but powerful Java library allowing
|
||||
analysis and manipulation of parts of an HTML document, including
|
||||
some common server-side tags, while reproducing verbatim any
|
||||
unrecognised or invalid HTML.
|
||||
|
||||
It also provides high-level HTML form manipulation functions.
|
||||
|
||||
WWW: http://jerichohtml.sourceforge.net/doc/index.html
|
Loading…
Reference in New Issue
Block a user