You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/langchain_community/document_transformers/xsl/html_chunks_with_headers.xslt

200 lines
5.9 KiB
XML

<?xml version="1.0" encoding="UTF-8" ?>
<!-- HTML PRE CHUNK:
This performs a best-effort preliminary "chunking" of text in an HTML file,
matching each chunk with a "headers" metadata value based on header tags in proximity.
recursively visits every element (the default, unnamed template mode).
for every element with tagname of interest (only):
1. serializes a div (and metadata marking the element's xpath).
2. calculates all text-content for the given element, including descendant elements which are *not* themselves tags of interest.
3. if any such text-content was found, serializes a "headers" (span.headers) along with this text (span.chunk).
to calculate the "headers" of an element:
1. recursively gets the *nearest* prior-siblings for headings of *each* level
2. recursively repeats that step#1 for each ancestor (regardless of tag)
n.b. this recursion is only performed (beginning with) elements which are
both (1) tags-of-interest and (2) have their own text-content.
-->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns="http://www.w3.org/1999/xhtml">
<!-- tags: pipe-delimited list of element local names treated as "tags of interest" (each becomes its own div/chunk). -->
<xsl:param name="tags" select="'div|p|blockquote|ol|ul'" />
<!--
Root template: emits the XHTML scaffold (html/head/body) that wraps the filtered tree.
The head carries an inline stylesheet for the serialized div, heading, .xpath and .chunk
elements (NOTE(review): presumably only to make the intermediate result readable in a browser).
The body is filled by applying the default-mode templates to the document element.
-->
<xsl:template match="/">
<html>
<head>
<style>
div {
border: solid;
margin-top: .5em;
padding-left: .5em;
}
h1, h2, h3, h4, h5, h6 {
margin: 0;
}
.xpath {
color: blue;
}
.chunk {
margin: .5em 1em;
}
</style>
</head>
<body>
<!-- create "filtered tree" with only tags of interest -->
<!-- select="*" starts the default-mode traversal at the document element -->
<xsl:apply-templates select="*" />
</body>
</html>
</xsl:template>
<xsl:template match="*">
  <!--
  Default-mode visitor. An element whose local name is listed in $tags is serialized
  as a div (xpath label, optional headers, optional chunk text) and its child elements
  are visited inside that div; any other element is skipped, but its child elements
  are still visited.
  -->
  <xsl:variable name="isTagOfInterest"
                select="contains(concat('|', $tags, '|'), concat('|', local-name(), '|'))" />
  <xsl:choose>
    <!-- not a tag of interest: emit nothing for this element, keep descending -->
    <xsl:when test="not($isTagOfInterest)">
      <xsl:apply-templates select="*" />
    </xsl:when>
    <xsl:otherwise>
      <!-- text owned by this element: child text nodes plus descendants that are not tags of interest -->
      <xsl:variable name="rawText">
        <xsl:apply-templates mode="text" select="node()" />
      </xsl:variable>
      <xsl:variable name="chunkText" select="normalize-space($rawText)" />
      <!-- path label identifying this element in the source document -->
      <xsl:variable name="elementPath">
        <xsl:apply-templates mode="xpath" select="." />
      </xsl:variable>
      <div title="{$elementPath}">
        <small class="xpath">
          <xsl:value-of select="$elementPath" />
        </small>
        <!-- headers and chunk are emitted only when the element has text of its own -->
        <xsl:if test="string-length($chunkText) &gt; 0">
          <xsl:variable name="headerTrail">
            <xsl:apply-templates mode="headingsWithAncestors" select="." />
          </xsl:variable>
          <xsl:if test="string-length(normalize-space($headerTrail)) &gt; 0">
            <span class="headers">
              <xsl:copy-of select="$headerTrail" />
            </span>
          </xsl:if>
          <p class="chunk">
            <xsl:value-of select="$chunkText" />
          </p>
        </xsl:if>
        <!-- nested tags of interest are serialized inside this div -->
        <xsl:apply-templates select="*" />
      </div>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!-- text mode:
prints text nodes;
for elements, recurses down child nodes (text and elements) *except* certain exceptions:
tags of interest (handled by their own default-mode match),
non-content text (e.g. script|style)
-->
<!-- ignore non-content text -->
<xsl:template mode="text" match="*[local-name() = 'script' or local-name() = 'style']">
  <!--
  Text mode: script and style elements contribute no text to a chunk.
  Matching on local-name() (as the tag-of-interest tests do) keeps this working when
  the input elements are namespaced (e.g. XHTML); a bare name test would only match
  elements in no namespace. The predicate form has default priority 0.5, so it still
  wins over the generic match="*" text-mode template.
  -->
</xsl:template>
<!-- for all other elements *except tags of interest*, recurse on child-nodes (text and elements) -->
<xsl:template mode="text" match="*">
  <!--
  Text mode for ordinary elements: descend into child nodes (text and elements) unless
  this element is itself a tag of interest, in which case it contributes nothing here
  because it is serialized as its own chunk by the default-mode template.
  -->
  <xsl:if test="not(contains(concat('|', $tags, '|'), concat('|', local-name(), '|')))">
    <xsl:apply-templates mode="text" select="node()" />
  </xsl:if>
</xsl:template>
<!-- xpath mode:
return an xpath which matches this element uniquely
-->
<xsl:template mode="xpath" match="*">
  <!--
  Emits one path step per element, outermost ancestor first, in the form
  name[index]/ (so the full result looks like html[1]/body[1]/div[2]/).
  The index counts only preceding siblings with the SAME name, because in XPath
  the step name[n] selects the n-th child with that name; counting every preceding
  element sibling would produce steps that point at a different (or no) element.
  -->
  <!-- recurse up parents first so ancestors are written before this step -->
  <xsl:apply-templates mode="xpath" select="parent::*" />
  <xsl:value-of select="name()" />
  <xsl:text>[</xsl:text>
  <!-- current() is the element this template was invoked on -->
  <xsl:value-of select="1 + count(preceding-sibling::*[name() = name(current())])" />
  <xsl:text>]/</xsl:text>
</xsl:template>
<!-- headingsWithAncestors mode:
recurses up parents (ALL ancestors)
-->
<xsl:template mode="headingsWithAncestors" match="*">
  <!--
  Collects headings for this element and for every ancestor element.
  apply-templates processes the selected nodes in document order, so the outermost
  ancestor is handled first and this element last; each one starts its own
  prior-sibling search with all six heading levels allowed.
  -->
  <xsl:apply-templates mode="headingsWithPriorSiblings" select="ancestor-or-self::*">
    <xsl:with-param name="maxHead" select="6" />
  </xsl:apply-templates>
</xsl:template>
<!-- headingsWithPriorSiblings mode:
recurses up preceding-siblings
-->
<xsl:template mode="headingsWithPriorSiblings" match="*">
  <!--
  Walks backwards over preceding siblings and emits the nearest heading of each level.
  Param maxHead: highest heading number (h1..h6) still wanted; a heading is accepted
  when its number is less than or equal to maxHead, after which only lower-numbered
  headings are looked for.
  Instead of stepping one sibling at a time, each step jumps straight to the nearest
  preceding sibling that can have an effect (an accepted heading or a 'header' element).
  Siblings in between never produce output, so the result is unchanged, but the
  template nesting depth no longer grows with the number of siblings, which matters
  for processors that cap nested template calls.
  -->
  <xsl:param name="maxHead" />
  <!-- NaN for any element whose name is not 'h' followed by a number; NaN fails every comparison below -->
  <xsl:variable name="headLevel" select="number(substring(local-name(), 2))" />
  <xsl:choose>
    <xsl:when test="'h' = substring(local-name(), 1, 1) and $maxHead &gt;= $headLevel">
      <!-- continue the search for lower-numbered headings only; they are emitted before this one -->
      <xsl:variable name="nextMax" select="$headLevel - 1" />
      <xsl:apply-templates mode="headingsWithPriorSiblings"
                           select="preceding-sibling::*[
                                     self::header or
                                     ('h' = substring(local-name(), 1, 1) and
                                      $nextMax &gt;= number(substring(local-name(), 2)))
                                   ][1]">
        <xsl:with-param name="maxHead" select="$nextMax" />
      </xsl:apply-templates>
      <xsl:apply-templates mode="heading" select="." />
    </xsl:when>
    <!-- special case for 'header' tag, serialize child-headers -->
    <xsl:when test="self::header">
      <xsl:apply-templates mode="heading" select="h1|h2|h3|h4|h5|h6" />
      <!--
      we choose not to recurse further up prior-siblings in this case,
      but n.b. the 'headingsWithAncestors' template will still continue with the ancestors.
      -->
    </xsl:when>
    <xsl:otherwise>
      <!-- no work on this element: jump to the nearest prior sibling that is a 'header' or an accepted heading -->
      <xsl:apply-templates mode="headingsWithPriorSiblings"
                           select="preceding-sibling::*[
                                     self::header or
                                     ('h' = substring(local-name(), 1, 1) and
                                      $maxHead &gt;= number(substring(local-name(), 2)))
                                   ][1]">
        <xsl:with-param name="maxHead" select="$maxHead" />
      </xsl:apply-templates>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<xsl:template mode="heading"
              match="*[contains('|h1|h2|h3|h4|h5|h6|', concat('|', local-name(), '|'))]">
  <!--
  Serializes one heading: a shallow copy of the h1..h6 element (no attributes or
  child elements) containing its whitespace-normalized text.
  The match keys on local-name() so that namespaced headings (e.g. XHTML) are copied
  as elements too; a bare name test would let them fall through to the built-in
  template, which emits only their text.
  -->
  <xsl:copy>
    <xsl:value-of select="normalize-space(.)" />
  </xsl:copy>
</xsl:template>
</xsl:stylesheet>