Html Parser For Gas Min
/**
 * Minimal regex-based HTML parser that needs no DOM (usable e.g. from
 * Google Apps Script). Can be called with or without `new`.
 *
 * The resulting instance exposes:
 *  - `html`  : array of root nodes in document order
 *  - `tags`  : index  tag name -> array of element nodes
 *  - `attr`  : index  attribute name -> attribute value -> array of element
 *              nodes (for `class`, every space-separated class name is a key)
 *  - `title` : inner text of a `<title>` element, set only when one is found
 *
 * Every node has the shape
 *   `{tagname, attr, innerText, tree, parent}`.
 * Text nodes use the tag name `"_Node"`, have `attr === null` and carry their
 * text as the single entry of `tree`. `innerText` of an element is the
 * concatenation of all text nodes below it.
 *
 * Limitations that follow from the regular expressions used:
 *  - only double-quoted attributes (`name="value"`) are indexed;
 *  - start tags whose attribute list ends in "/" (e.g. `<img src="x" />`) and
 *    `<br/>` are not recognised as elements and stay inside the surrounding text;
 *  - the content of `<script>` elements is not parsed or stored.
 *
 * @example
 * const parsed = HTMLparser('<p class="intro">Hello <b>world</b></p>');
 * parsed.tags.p[0].innerText; // "Hello world"
 *
 * @param {string} html_str - HTML source to parse.
 * @returns {HTMLparser} the populated parser instance.
 * @throws {TypeError} when `html_str` is not a string.
 */
function HTMLparser(html_str) {
  if (!(this instanceof HTMLparser)) {
    return new HTMLparser(html_str);
  }
  if (typeof html_str !== "string") {
    throw new TypeError("HTMLparser expects an HTML string");
  }

  const self = this;
  this.html = [];
  // Prototype-less dictionaries: tag/attribute names such as "constructor"
  // or "toString" must not collide with members inherited from Object.
  this.attr = Object.create(null);
  this.tags = Object.create(null);

  // [1]: tag name, [2]: raw attribute text (with its leading space),
  // [3]: everything that follows the start tag.
  const START_TAG = /<([a-zA-Z][^\t\n\r\f \/>\x00]*?)(| [a-zA-Z][^\t\n\r\f>\x00]*?[^\/])>([\s\S]*?)$/;

  /** Escapes regex metacharacters so a tag name can be embedded in a RegExp. */
  function escapeRegExp(text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  }

  /** Builds a capturing regex matching any start or end tag named `tagname`. */
  function sameTagReg(tagname) {
    const name = escapeRegExp(tagname);
    return new RegExp(
      "(<" + name + ">|<" + name + " [a-zA-Z][^\\t\\n\\r\\f>\\x00]*?[^/]>|<\\/" + name + ">)"
    );
  }

  /** Returns the array stored under `key`, creating it on first use. */
  function bucket(index, key) {
    if (!index[key]) {
      index[key] = [];
    }
    return index[key];
  }

  /** Appends `text` to the innerText of `node` and of all its ancestors. */
  function appendText(node, text) {
    for (let cur = node; cur; cur = cur.parent) {
      cur.innerText += text;
    }
  }

  /** Creates a text node and propagates its text up the parent chain. */
  function createTextNode(text, parent) {
    const node = {tagname: "_Node", attr: null, innerText: "", tree: [text], parent: parent};
    appendText(node, text);
    return node;
  }

  /**
   * Parses the double-quoted attributes in `attrText`, registers `node` in
   * the global attribute index and returns the node's own attribute map
   * (`class` becomes an array of class names, everything else a string).
   */
  function indexAttributes(attrText, node) {
    const attrs = {};
    // Fresh literal per call, so the /g regex never carries a stale lastIndex.
    const pairReg = /([a-zA-Z][^\t\n\r\f >\x00]*)="(.*?)"/g;
    let pair = pairReg.exec(attrText);
    while (pair !== null) {
      const name = pair[1];
      if (!self.attr[name]) {
        self.attr[name] = Object.create(null);
      }
      const byValue = self.attr[name];
      let value;
      if (name === "class") {
        value = pair[2].split(" ");
        value.forEach(function (cls) {
          bucket(byValue, cls).push(node);
        });
      } else {
        value = pair[2];
        bucket(byValue, value).push(node);
      }
      attrs[name] = value;
      pair = pairReg.exec(attrText);
    }
    return attrs;
  }

  /**
   * Parses `str` and prepends the resulting nodes to `ary` (which ends up in
   * document order). `parent` is the enclosing element or null at the root.
   */
  function per(str, ary, parent) {
    const match = str.match(START_TAG);
    if (!match) {
      // No element left: the whole fragment is text.
      ary.unshift(createTextNode(str, parent));
      return;
    }

    // Text in front of the first tag is kept as its own text node. It is
    // created now so innerText accumulates in document order.
    const leading = str.slice(0, match.index);
    const leadingNode = leading === "" ? null : createTextNode(leading, parent);

    const tagname = match[1];
    const node = {tagname: tagname, attr: null, innerText: "", tree: [], parent: parent};
    node.attr = indexAttributes(match[2], node);

    // Splitting on a capturing regex puts the same-name tags at odd indices.
    // Count nested openings until the matching end tag is reached.
    const rest = match[3];
    const parts = rest.split(sameTagReg(tagname));
    let opened = 1;
    let closed = 0;
    let closeIdx = 0;
    for (let i = 1; i < parts.length; i += 2) {
      if (parts[i].charAt(1) === "/") {
        closed++;
        if (closed === opened) {
          closeIdx = i;
          break;
        }
      } else {
        opened++;
      }
    }

    // Without a matching end tag (void or unclosed element) the element is
    // treated as empty and everything after it as its following siblings.
    let inner = "";
    let following = rest;
    if (closeIdx > 0) {
      inner = parts.slice(0, closeIdx).join("");
      following = parts.slice(closeIdx + 1).join("");
    }

    if (tagname === "title") {
      self.title = inner;
    }
    if (tagname !== "script") {
      per(inner, node.tree, node);
    }
    if (following !== "") {
      per(following, ary, parent);
    }

    ary.unshift(node);
    if (leadingNode) {
      ary.unshift(leadingNode);
    }
    // Registered after its children and following siblings have been parsed.
    bucket(self.tags, tagname).push(node);
  }

  // Comments and the doctype are not part of the tree; leading whitespace is
  // removed so that a document's first root node is still its first element.
  const cleaned = html_str
    .replace(/<!--[\s\S]*?-->/g, "")
    .replace(/<!doctype[^>]*>/gi, "")
    .replace(/^\s+/, "");
  per(cleaned, this.html, null);
}
HTMLparser.prototype = {
  // Replacing the prototype object would otherwise drop this back-reference.
  constructor: HTMLparser,

  /**
   * Finds elements matching a (subset of a) CSS selector using the `tags`
   * and `attr` indexes and the `parent` links of the parsed nodes.
   *
   * Supported: tag names, `.class`, `#id`, attribute selectors
   * (`[a]`, `[a="v"]`, `[a*="v"]`, `[a^="v"]`, `[a$="v"]`, single or double
   * quotes), and the descendant (space) and child (`>`) combinators.
   * Tag, class and attribute names are compared case-sensitively.
   * Any "+" or "~" anywhere in the selector is rejected.
   *
   * @param {string} selector - selector to evaluate.
   * @returns {Array<Object>} matching element nodes, in the order in which
   *   they are stored in the index used to look up the rightmost selector
   *   part; empty when nothing matches or the selector has no usable part.
   * @throws {Error} for sibling combinators and the listed pseudo-classes.
   */
  search: function (selector) {
    const parser = this;

    /** Own-property check that also works for prototype-less dictionaries. */
    function own(obj, key) {
      return obj != null && Object.prototype.hasOwnProperty.call(obj, key);
    }

    /** Own-property read; undefined when `obj` is missing or lacks `key`. */
    function lookup(obj, key) {
      return own(obj, key) ? obj[key] : undefined;
    }

    /**
     * Splits the selector into compound selectors, rightmost first.
     * `next === true` means the compound is a direct child (`>`) of the
     * compound that follows it in the returned array.
     */
    function parseSelector(sel) {
      if (/[+~]/.test(sel)) {
        throw new Error('You cannot use Adjacent sibling combinator "+/~".');
      }
      if (/:(nth-child\(|nth-of-type\(|not\(|first-child|first-of-type|last-child|last-of-type)/.test(sel)) {
        throw new Error('You cannot use Pseudo-elements like ":nth-of-type()"');
      }

      const COMBINATOR = /( ?> ?|(?<=[a-zA-Z0-9\]_\-]) (?=[a-zA-Z\[.#_]))/;
      // The lookbehind accepts digits, "_" and "-" so that "h1.title" splits
      // into tag "h1" and class "title".
      const PART = /(\[.*?\]|(?<=[a-zA-Z0-9_\-\]]|^)[.#][a-zA-Z_][a-zA-Z0-9_\-]*)/;
      const ATTRIBUTE = /^\[(.*?)(?:=(?:"(.*?)"|'(.*?)')|)\]/;

      const chain = [];
      let childOfPrevious = false;
      sel.split(COMBINATOR).forEach(function (segment) {
        if (segment === "" || segment === " ") {
          return;
        }
        if (/^ ?> ?$/.test(segment)) {
          childOfPrevious = true;
          return;
        }
        const compound = {tag: null, class: [], id: [], attr: [], next: childOfPrevious};
        childOfPrevious = false;

        segment.split(PART).filter(Boolean).forEach(function (part) {
          if (part.charAt(0) === "#") {
            compound.id.push(part.slice(1));
          } else if (part.charAt(0) === ".") {
            compound.class.push(part.slice(1));
          } else if (/^[a-zA-Z]/.test(part)) {
            compound.tag = part;
          } else {
            const m = part.match(ATTRIBUTE);
            if (m) {
              const quoted = m[2] !== undefined ? m[2] : m[3];
              // A trailing *, ^ or $ on the name is the match operator.
              const op = m[1].match(/[*^$]$/);
              compound.attr.push({
                name: op ? m[1].slice(0, -1) : m[1],
                op: op ? op[0] : "",
                hasValue: quoted !== undefined,
                value: quoted !== undefined ? quoted : "",
              });
            }
          }
        });
        chain.unshift(compound);
      });
      return chain;
    }

    /** Checks one attribute condition against a node's attribute map. */
    function matchesAttribute(attrs, cond) {
      if (!own(attrs, cond.name)) {
        return false;
      }
      if (!cond.hasValue) {
        return true;
      }
      const raw = attrs[cond.name];
      // `class` is stored as an array of class names.
      const actual = Array.isArray(raw) ? raw.join(" ") : String(raw);
      switch (cond.op) {
        case "*":
          return cond.value !== "" && actual.includes(cond.value);
        case "^":
          return cond.value !== "" && actual.startsWith(cond.value);
        case "$":
          return cond.value !== "" && actual.endsWith(cond.value);
        default:
          return actual === cond.value;
      }
    }

    /** True when `node` satisfies every condition of the compound selector. */
    function matchesCompound(node, compound) {
      const attrs = node.attr;
      if (!attrs) {
        return false; // text nodes carry no attributes
      }
      const isEmpty = !compound.tag && !compound.class.length &&
        !compound.id.length && !compound.attr.length;
      if (isEmpty) {
        return false;
      }
      if (compound.tag && node.tagname !== compound.tag) {
        return false;
      }
      const classes = own(attrs, "class") && Array.isArray(attrs.class) ? attrs.class : [];
      if (!compound.class.every(function (cls) { return classes.includes(cls); })) {
        return false;
      }
      const id = lookup(attrs, "id");
      if (!compound.id.every(function (wanted) { return wanted === id; })) {
        return false;
      }
      return compound.attr.every(function (cond) { return matchesAttribute(attrs, cond); });
    }

    /** Picks the smallest convenient index to start from, without duplicates. */
    function candidatesFor(compound) {
      let list = [];
      if (compound.tag) {
        list = lookup(parser.tags, compound.tag) || [];
      } else if (compound.class.length) {
        list = lookup(lookup(parser.attr, "class"), compound.class[0]) || [];
      } else if (compound.id.length) {
        list = lookup(lookup(parser.attr, "id"), compound.id[0]) || [];
      } else if (compound.attr.length) {
        const byValue = lookup(parser.attr, compound.attr[0].name);
        if (byValue) {
          Object.keys(byValue).forEach(function (value) {
            list = list.concat(byValue[value]);
          });
        }
      }
      return Array.from(new Set(list));
    }

    const chain = parseSelector(selector.trim());
    if (chain.length === 0) {
      return [];
    }

    /**
     * Verifies the compounds left of position `k - 1` against the ancestors
     * of `node`, backtracking over candidate ancestors.
     */
    function matchesAncestors(node, k) {
      if (k >= chain.length) {
        return true;
      }
      const directOnly = chain[k - 1].next;
      for (let anc = node.parent; anc; anc = anc.parent) {
        if (matchesCompound(anc, chain[k]) && matchesAncestors(anc, k + 1)) {
          return true;
        }
        if (directOnly) {
          break;
        }
      }
      return false;
    }

    const target = chain[0];
    return candidatesFor(target).filter(function (node) {
      return matchesCompound(node, target) && matchesAncestors(node, 1);
    });
  },
};
// Usage example (browser): parse the markup of the current page body,
// then look up every <span> element through the tag index.
var Html = HTMLparser(document.body.outerHTML);
Html.search("span");
page revision: 3, last edited: 31 Mar 2020 11:30