html - How to wrap part of a text in a node with JavaScript -
I have a challenging problem to solve. I'm working on a script which takes a regex as input. The script finds all matches for this regex in a document and wraps each match in its own <span> element. The hard part is that the text is a formatted HTML document, so the script needs to navigate through the DOM and apply the regex across multiple text nodes at once, while figuring out where it has to split text nodes if needed.
For example, with a regex that captures full sentences starting with a capital letter and ending with a period, this document:
<p> <b>html</b> language used make <b>websites.</b> developed <i>cern</i> employees in 90s. <p>
would be turned into this:
<p> <span><b>html</b> language used make <b>websites.</b></span> <span>it developed <i>cern</i> employees in 90s.</span> <p>
The script then returns the list of all the created spans.
I already have some code that finds all the text nodes and stores them in a list, along with their position across the whole document and their depth. You don't really need to understand that code to help me, and its recursive structure can be a bit confusing. The first part I'm not sure how to do is figuring out which elements should be included within the span.
/**
 * Record describing one text node found by findtextnodes().
 *
 * @param {Node} node - The text node itself.
 * @param {number} depth - Nesting depth of the node below the search root
 *     (0 for a direct child of the root, -1 when the root itself is the text node).
 * @param {number} start - Offset of the node's first character within the
 *     concatenated text of all text nodes under the search root.
 */
function smartnode(node, depth, start) {
  this.node = node;
  this.depth = depth;
  this.start = start;
}

/**
 * Collects every text node under `node`, in document order, together with
 * its depth and its character offset in the concatenated text.
 *
 * @param {Node} node - Root of the subtree to scan.
 * @param {number} [depth=-1] - Depth assigned to `node`; callers normally omit it.
 * @param {number} [start=0] - Text offset at which `node` begins; callers normally omit it.
 * @returns {smartnode[]} One record per text node, in document order.
 */
function findtextnodes(node, depth, start) {
  // Node.TEXT_NODE is 3 in the DOM specification; using the literal keeps the
  // function independent of a global `Node` object.
  var TEXT_NODE = 3;
  var list = [];
  start = start || 0;
  depth = typeof depth !== "undefined" ? depth : -1;

  if (node.nodeType === TEXT_NODE) {
    list.push(new smartnode(node, depth, start));
    return list;
  }

  for (var i = 0; i < node.childNodes.length; ++i) {
    var found = findtextnodes(node.childNodes[i], depth + 1, start);
    if (!found.length) {
      // A child without any text must not move the running offset.
      continue;
    }
    // Continue right after the LAST text node of this child. Adding only that
    // node's length would be wrong when the child holds several text nodes.
    var last = found[found.length - 1];
    start = last.start + last.node.nodeValue.length;
    list = list.concat(found);
  }
  return list;
}
I figure I'll make a string out of the whole document, run the regex through it, and use the list to find which nodes correspond to which regex matches, and then split the text nodes accordingly.
But an issue arises when I have a document like this:
<p> program <a href="beta.html">not stable yet. not use in production yet.</a> </p>
There's a sentence that starts outside of the <a>
tag but ends inside it. I don't want the script to split that link in two tags. In a more complex document, it could ruin the page if it did. The code could either wrap the two sentences together:
<p> <span>this program <a href="beta.html">not stable yet. not use in production yet.</a></span> </p>
Or just wrap each part in its own element:
<p> <span>this program </span> <a href="beta.html"> <span>not stable yet.</span> <span>do not use in production yet.</span> </a> </p>
There could be a parameter to specify what it should do. I'm just not sure how to figure out when an impossible cut is about to happen, and how to recover from it.
Another issue comes up when I have whitespace inside a child element, like this:
<p>this <b>sentence. </b></p>
Technically, the regex match would end right after the period, before the end of the <b>
tag. However, it would be much better to consider the space as part of the match and wrap it like this:
<p><span>this <b>sentence. </b></span></p>
than this:
<p><span>this </span><b><span>sentence.</span> </b></p>
But that's a minor issue. After all, I could just allow extra white-space to be included within the regex.
I know this might sound like a "do it for me" question, and it's not the kind of quick question we see on SO on a daily basis, but I've been stuck on this for a while and it's for an open-source library I'm working on. Solving this problem is the last obstacle. If you think another SE site is better suited for this question, redirect me please.
Here are two ways to deal with this.
I don't know if the following will exactly match your needs. It's a simple enough solution to the problem, but at least it doesn't use a regex to manipulate HTML tags. It performs pattern matching against the raw text and then uses the DOM to manipulate the content.
first approach
This approach creates only one <span>
tag per match, leveraging some less common browser APIs.
(See the main problem of this approach below the demo, and if you are not sure, use the second approach.)
The Range
class represents a text fragment. It has a surroundContents
function that lets you wrap a range in an element. Except it has a caveat:
This method is nearly equivalent to
newNode.appendChild(range.extractContents()); range.insertNode(newNode)
. After surrounding, the boundary points of the range include newNode.
An exception will be thrown, however, if the
Range
splits a non-Text
node with only one of its boundary points. That is, unlike the alternative above, if there are partially selected nodes, they will not be cloned and instead the operation will fail.
Well, the workaround is provided in the MDN documentation, so all's good.
So here's the algorithm:
- Make a list of Text nodes and keep their start indices in the text.
- Concatenate these nodes' values to get the text.
- Find matches over the text, and for each match:
  - Find the start and end nodes of the match, comparing the nodes' start indices to the match position.
  - Create a Range over the match.
  - Let the browser do the dirty work using the trick above.
  - Rebuild the node list, since the last action changed the DOM.
Here's my implementation with a demo:
here's implementation demo:
/**
 * Wraps every match of `regex` in the text under `element` in a single
 * <span class="highlight">, using a DOM Range per match.
 *
 * The text of all text nodes under `element` is concatenated and matched as
 * one string, so a match may start and end in different text nodes.
 *
 * NOTE: the wrapping relies on Range.extractContents(), which clones partially
 * selected ancestors. Matches that cross block-level element boundaries will
 * therefore duplicate those elements.
 *
 * @param {Element} element - Root element whose text is searched.
 * @param {RegExp} regex - Pattern to highlight. It is never mutated; a global
 *     copy is used internally, so a missing "g" flag cannot cause an endless loop.
 * @returns {Element[]} The created span elements, in match order.
 */
function highlight(element, regex) {
  var document = element.ownerDocument;

  // Match with a private global copy: exec() on a non-global regex always
  // restarts at index 0, and the caller's lastIndex must stay untouched.
  var flags = typeof regex.flags === "string"
    ? regex.flags
    : (regex.ignoreCase ? "i" : "") + (regex.multiline ? "m" : "");
  if (flags.indexOf("g") === -1) flags += "g";
  var pattern = new RegExp(regex.source, flags);

  // Lists the text nodes under `element` with their offset in the full text.
  var getNodes = function () {
    var found = [];
    var offset = 0;
    var current;
    var nodeIterator = document.createNodeIterator(element, NodeFilter.SHOW_TEXT, null, false);
    while ((current = nodeIterator.nextNode())) {
      found.push({
        textNode: current,
        start: offset,
        length: current.nodeValue.length
      });
      offset += current.nodeValue.length;
    }
    return found;
  };

  var spans = [];
  var nodes = getNodes();
  if (!nodes.length) return spans;

  var text = "";
  for (var i = 0; i < nodes.length; ++i) {
    text += nodes[i].textNode.nodeValue;
  }

  var match;
  while ((match = pattern.exec(text))) {
    // Prevent empty matches from causing an infinite loop.
    if (!match[0].length) {
      pattern.lastIndex++;
      continue;
    }
    var matchEnd = match.index + match[0].length;

    // Find the text nodes that contain the first and last matched character.
    var startNode = null;
    var endNode = null;
    for (i = 0; i < nodes.length; ++i) {
      var node = nodes[i];
      if (node.start + node.length <= match.index) continue;
      if (!startNode) startNode = node;
      if (node.start + node.length >= matchEnd) {
        endNode = node;
        break;
      }
    }

    var range = document.createRange();
    range.setStart(startNode.textNode, match.index - startNode.start);
    range.setEnd(endNode.textNode, matchEnd - endNode.start);

    // Same effect as range.surroundContents(), but it does not throw when the
    // range only partially selects a non-text node.
    var spanNode = document.createElement("span");
    spanNode.className = "highlight";
    spanNode.appendChild(range.extractContents());
    range.insertNode(spanNode);
    spans.push(spanNode);

    // The DOM changed; the concatenated text did not, so only the node list
    // needs rebuilding.
    nodes = getNodes();
  }
  return spans;
}

// Test code: wires the demo form to highlight(). It is skipped when there is
// no DOM or no #test-cases element, so loading the script never throws.
var testdiv = typeof document !== "undefined" ? document.getElementById("test-cases") : null;
var originalhtml = testdiv ? testdiv.innerHTML : "";

function test() {
  if (!testdiv) return;
  testdiv.innerHTML = originalhtml;
  try {
    var regex = new RegExp(document.getElementById("regex").value, "g");
    highlight(testdiv, regex);
  } catch (e) {
    testdiv.innerText = e;
  }
}

if (testdiv) {
  document.getElementById("runbtn").onclick = test;
  test();
}
/* Marker applied to every span created by highlight(). */
.highlight {
  background-color: yellow;
  border: 1px solid orange;
  border-radius: 5px;
}

/* Framed box used for the regex form and for the test-case container. */
.section {
  border: 1px solid gray;
  padding: 10px;
  margin: 10px;
}
<!-- Demo page: a regex input, a run button, and the sample content that the script highlights. -->
<form class="section">
  regex: <input id="regex" type="text" value="[a-z].*?\." />
  <!-- type="button" keeps the click from submitting the form and reloading the page. -->
  <button id="runbtn" type="button">highlight</button>
</form>
<div id="test-cases" class="section">
  <div>foo bar baz</div>
  <p> <b>html</b> language used make <b>websites.</b> it developed <i>cern</i> employees in 90s. </p>
  <p> program <a href="beta.html">not stable yet. not use in production yet.</a> </p>
  <div>foo bar baz</div>
</div>
OK, that was the lazy approach which, unfortunately, doesn't work for some cases. It works well if you only highlight across inline elements, but it breaks when there are block elements along the way, because of the following property of the extractContents
function:
Partially selected nodes are cloned to include the parent tags necessary to make the document fragment valid.
That's bad. It'll just duplicate block-level nodes. Try the previous demo with the baz\s+html
regex if you want to see how it breaks.
second approach
This approach iterates over the matching nodes, creating <span>
tags along the way.
The overall algorithm is straightforward, as it just wraps each matching node in its own <span>
. But this means we have to deal with partially matching text nodes, which requires some more effort.
If a text node matches partially, it's split with the splitText
function:
After the split, the current node contains all the content up to the specified offset point, and a newly created node of the same type contains the remaining text. The newly created node is returned to the caller.
/**
 * Highlights every match of `regex` in the text under `element` by wrapping
 * each matched text node (or matched part of one) in its own
 * <span class="highlight">.
 *
 * The text of all text nodes under `element` is concatenated and matched as
 * one string. A match that spans several text nodes produces one span per
 * node. Text nodes that are only partially matched are split with splitText()
 * so that only the matched part is wrapped.
 *
 * @param {Element} element - Root element whose text is searched.
 * @param {RegExp} regex - Pattern to highlight. It is never mutated; a global
 *     copy is used internally, so a missing "g" flag cannot cause an endless loop.
 * @returns {Element[]} The created span elements, in document order.
 */
function highlight(element, regex) {
  var document = element.ownerDocument;

  // Match with a private global copy: exec() on a non-global regex always
  // restarts at index 0, and the caller's lastIndex must stay untouched.
  var flags = typeof regex.flags === "string"
    ? regex.flags
    : (regex.ignoreCase ? "i" : "") + (regex.multiline ? "m" : "");
  if (flags.indexOf("g") === -1) flags += "g";
  var pattern = new RegExp(regex.source, flags);

  // Collect the text nodes together with their offset in the full text.
  var spans = [];
  var nodes = [];
  var text = "";
  var node;
  var nodeIterator = document.createNodeIterator(element, NodeFilter.SHOW_TEXT, null, false);
  while ((node = nodeIterator.nextNode())) {
    nodes.push({ textNode: node, start: text.length });
    text += node.nodeValue;
  }
  if (!nodes.length) return spans;

  var match;
  while ((match = pattern.exec(text))) {
    var matchLength = match[0].length;

    // Prevent empty matches from causing an infinite loop.
    if (!matchLength) {
      pattern.lastIndex++;
      continue;
    }
    var matchEnd = match.index + matchLength;

    for (var i = 0; i < nodes.length; ++i) {
      node = nodes[i];
      // Read the length live: earlier splits shorten the node's value.
      var nodeLength = node.textNode.nodeValue.length;

      // Skip nodes that end before the match.
      if (node.start + nodeLength <= match.index) continue;

      // Stop at the first node that starts after the match.
      if (node.start >= matchEnd) break;

      // Nothing to highlight in an empty text node; avoid empty spans.
      if (!nodeLength) continue;

      // Split the start node if required. The matched tail becomes the next
      // list entry and is handled by the next iteration.
      if (node.start < match.index) {
        nodes.splice(i + 1, 0, {
          textNode: node.textNode.splitText(match.index - node.start),
          start: match.index
        });
        continue;
      }

      // Split the end node if required, so only the matched head is wrapped.
      if (node.start + nodeLength > matchEnd) {
        nodes.splice(i + 1, 0, {
          textNode: node.textNode.splitText(matchEnd - node.start),
          start: matchEnd
        });
      }

      // Highlight the current node.
      var spanNode = document.createElement("span");
      spanNode.className = "highlight";
      node.textNode.parentNode.replaceChild(spanNode, node.textNode);
      spanNode.appendChild(node.textNode);
      spans.push(spanNode);
    }
  }
  return spans;
}

// Test code: wires the demo form to highlight(). It is skipped when there is
// no DOM or no #test-cases element, so loading the script never throws.
var testdiv = typeof document !== "undefined" ? document.getElementById("test-cases") : null;
var originalhtml = testdiv ? testdiv.innerHTML : "";

function test() {
  if (!testdiv) return;
  testdiv.innerHTML = originalhtml;
  try {
    var regex = new RegExp(document.getElementById("regex").value, "g");
    highlight(testdiv, regex);
  } catch (e) {
    testdiv.innerText = e;
  }
}

if (testdiv) {
  document.getElementById("runbtn").onclick = test;
  test();
}
/* Marker applied to every span created by highlight(). */
.highlight {
  background-color: yellow;
}

/* Framed box used for the regex form and for the test-case container. */
.section {
  border: 1px solid gray;
  padding: 10px;
  margin: 10px;
}
<!-- Demo page: a regex input, a run button, and the sample content that the script highlights. -->
<form class="section">
  regex: <input id="regex" type="text" value="[a-z].*?\." />
  <!-- type="button" keeps the click from submitting the form and reloading the page. -->
  <button id="runbtn" type="button">highlight</button>
</form>
<div id="test-cases" class="section">
  <div>foo bar baz</div>
  <p> <b>html</b> language used make <b>websites.</b> it developed <i>cern</i> employees in 90s. </p>
  <p> program <a href="beta.html">not stable yet. not use in production yet.</a> </p>
  <div>foo bar baz</div>
</div>
This should be good enough for most cases, I hope. If you need to minimize the number of <span>
tags, it can be done by extending this function, but I wanted to keep it simple for now.
Comments
Post a Comment