//[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] //[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] //[5] Name ::= NameStartChar (NameChar)* var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]///\u10000-\uEFFFF var nameChar = new RegExp("[\\-\\.0-9"+nameStartChar.source.slice(1,-1)+"\\u00B7\\u0300-\\u036F\\u203F-\\u2040]"); var tagNamePattern = new RegExp('^'+nameStartChar.source+nameChar.source+'*(?:\:'+nameStartChar.source+nameChar.source+'*)?$'); //var tagNamePattern = /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/ //var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',') //S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE //S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE var S_TAG = 0;//tag name offerring var S_ATTR = 1;//attr name offerring var S_ATTR_SPACE=2;//attr name end and space offer var S_EQ = 3;//=space? var S_ATTR_NOQUOT_VALUE = 4;//attr value(no quot value only) var S_ATTR_END = 5;//attr value end and no space(quot end) var S_TAG_SPACE = 6;//(attr value end || tag end ) && (space offer) var S_TAG_CLOSE = 7;//closed el<el /> function XMLReader(){ } XMLReader.prototype = { parse:function(source,defaultNSMap,entityMap){ var domBuilder = this.domBuilder; domBuilder.startDocument(); _copy(defaultNSMap ,defaultNSMap = {}) parse(source,defaultNSMap,entityMap, domBuilder,this.errorHandler); domBuilder.endDocument(); } } function parse(source,defaultNSMapCopy,entityMap,domBuilder,errorHandler){ function fixedFromCharCode(code) { // String.prototype.fromCharCode does not supports // > 2 bytes unicode chars directly if (code > 0xffff) { code -= 0x10000; var surrogate1 = 0xd800 + (code >> 10) , surrogate2 = 0xdc00 + (code & 0x3ff); return String.fromCharCode(surrogate1, surrogate2); } else { return String.fromCharCode(code); } } function entityReplacer(a){ var k = a.slice(1,-1); if(k in entityMap){ return entityMap[k]; }else if(k.charAt(0) === '#'){ return fixedFromCharCode(parseInt(k.substr(1).replace('x','0x'))) }else{ errorHandler.error('entity not found:'+a); return a; } } function appendText(end){//has some bugs if(end>start){ var xt = source.substring(start,end).replace(/&#?\w+;/g,entityReplacer); locator&&position(start); domBuilder.characters(xt,0,end-start); start = end } } function position(p,m){ while(p>=lineEnd && (m = linePattern.exec(source))){ lineStart = m.index; lineEnd = lineStart + m[0].length; locator.lineNumber++; //console.log('line++:',locator,startPos,endPos) } locator.columnNumber = p-lineStart+1; } var lineStart = 0; var lineEnd = 0; var linePattern = /.*(?:\r\n?|\n)|.*$/g var locator = domBuilder.locator; var parseStack = [{currentNSMap:defaultNSMapCopy}] var closeMap = {}; var start = 0; while(true){ try{ var tagStart = source.indexOf('<',start); if(tagStart<0){ if(!source.substr(start).match(/^\s*$/)){ var doc = domBuilder.doc; var text = doc.createTextNode(source.substr(start)); doc.appendChild(text); domBuilder.currentElement = text; } return; } if(tagStart>start){ appendText(tagStart); } switch(source.charAt(tagStart+1)){ case '/': var end = source.indexOf('>',tagStart+3); var tagName = source.substring(tagStart+2,end); var config = parseStack.pop(); if(end<0){ tagName = source.substring(tagStart+2).replace(/[\s<].*/,''); //console.error('#@@@@@@'+tagName) errorHandler.error("end tag name: "+tagName+' is not complete:'+config.tagName); end = tagStart+1+tagName.length; }else if(tagName.match(/\s</)){ tagName = tagName.replace(/[\s<].*/,''); errorHandler.error("end tag name: "+tagName+' maybe not complete'); end = tagStart+1+tagName.length; } //console.error(parseStack.length,parseStack) //console.error(config); var localNSMap = config.localNSMap; var endMatch = config.tagName == tagName; var endIgnoreCaseMach = endMatch || config.tagName&&config.tagName.toLowerCase() == tagName.toLowerCase() if(endIgnoreCaseMach){ domBuilder.endElement(config.uri,config.localName,tagName); if(localNSMap){ for(var prefix in localNSMap){ domBuilder.endPrefixMapping(prefix) ; } } if(!endMatch){ errorHandler.fatalError("end tag name: "+tagName+' is not match the current start tagName:'+config.tagName ); } }else{ parseStack.push(config) } end++; break; // end elment case '?':// <?...?> locator&&position(tagStart); end = parseInstruction(source,tagStart,domBuilder); break; case '!':// <!doctype,<![CDATA,<!-- locator&&position(tagStart); end = parseDCC(source,tagStart,domBuilder,errorHandler); break; default: locator&&position(tagStart); var el = new ElementAttributes(); var currentNSMap = parseStack[parseStack.length-1].currentNSMap; //elStartEnd var end = parseElementStartPart(source,tagStart,el,currentNSMap,entityReplacer,errorHandler); var len = el.length; if(!el.closed && fixSelfClosed(source,end,el.tagName,closeMap)){ el.closed = true; if(!entityMap.nbsp){ errorHandler.warning('unclosed xml attribute'); } } if(locator && len){ var locator2 = copyLocator(locator,{}); //try{//attribute position fixed for(var i = 0;i<len;i++){ var a = el[i]; position(a.offset); a.locator = copyLocator(locator,{}); } //}catch(e){console.error('@@@@@'+e)} domBuilder.locator = locator2 if(appendElement(el,domBuilder,currentNSMap)){ parseStack.push(el) } domBuilder.locator = locator; }else{ if(appendElement(el,domBuilder,currentNSMap)){ parseStack.push(el) } } if(el.uri === 'http://www.w3.org/1999/xhtml' && !el.closed){ end = parseHtmlSpecialContent(source,end,el.tagName,entityReplacer,domBuilder) }else{ end++; } } }catch(e){ errorHandler.error('element parse error: '+e) //errorHandler.error('element parse error: '+e); end = -1; //throw e; } if(end>start){ start = end; }else{ //TODO: 这里有可能sax回退,有位置错误风险 appendText(Math.max(tagStart,start)+1); } } } function copyLocator(f,t){ t.lineNumber = f.lineNumber; t.columnNumber = f.columnNumber; return t; } /** * @see #appendElement(source,elStartEnd,el,selfClosed,entityReplacer,domBuilder,parseStack); * @return end of the elementStartPart(end of elementEndPart for selfClosed el) */ function parseElementStartPart(source,start,el,currentNSMap,entityReplacer,errorHandler){ var attrName; var value; var p = ++start; var s = S_TAG;//status while(true){ var c = source.charAt(p); switch(c){ case '=': if(s === S_ATTR){//attrName attrName = source.slice(start,p); s = S_EQ; }else if(s === S_ATTR_SPACE){ s = S_EQ; }else{ //fatalError: equal must after attrName or space after attrName throw new Error('attribute equal must after attrName'); } break; case '\'': case '"': if(s === S_EQ || s === S_ATTR //|| s == S_ATTR_SPACE ){//equal if(s === S_ATTR){ errorHandler.warning('attribute value must after "="') attrName = source.slice(start,p) } start = p+1; p = source.indexOf(c,start) if(p>0){ value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer); el.add(attrName,value,start-1); s = S_ATTR_END; }else{ //fatalError: no end quot match throw new Error('attribute value no end \''+c+'\' match'); } }else if(s == S_ATTR_NOQUOT_VALUE){ value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer); //console.log(attrName,value,start,p) el.add(attrName,value,start); //console.dir(el) errorHandler.warning('attribute "'+attrName+'" missed start quot('+c+')!!'); start = p+1; s = S_ATTR_END }else{ //fatalError: no equal before throw new Error('attribute value must after "="'); } break; case '/': switch(s){ case S_TAG: el.setTagName(source.slice(start,p)); case S_ATTR_END: case S_TAG_SPACE: case S_TAG_CLOSE: s =S_TAG_CLOSE; el.closed = true; case S_ATTR_NOQUOT_VALUE: case S_ATTR: case S_ATTR_SPACE: break; //case S_EQ: default: throw new Error("attribute invalid close char('/')") } break; case ''://end document //throw new Error('unexpected end of input') errorHandler.error('unexpected end of input'); if(s == S_TAG){ el.setTagName(source.slice(start,p)); } return p; case '>': switch(s){ case S_TAG: el.setTagName(source.slice(start,p)); case S_ATTR_END: case S_TAG_SPACE: case S_TAG_CLOSE: break;//normal case S_ATTR_NOQUOT_VALUE://Compatible state case S_ATTR: value = source.slice(start,p); if(value.slice(-1) === '/'){ el.closed = true; value = value.slice(0,-1) } case S_ATTR_SPACE: if(s === S_ATTR_SPACE){ value = attrName; } if(s == S_ATTR_NOQUOT_VALUE){ errorHandler.warning('attribute "'+value+'" missed quot(")!!'); el.add(attrName,value.replace(/&#?\w+;/g,entityReplacer),start) }else{ if(currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !value.match(/^(?:disabled|checked|selected)$/i)){ errorHandler.warning('attribute "'+value+'" missed value!! "'+value+'" instead!!') } el.add(value,value,start) } break; case S_EQ: throw new Error('attribute value missed!!'); } // console.log(tagName,tagNamePattern,tagNamePattern.test(tagName)) return p; /*xml space '\x20' | #x9 | #xD | #xA; */ case '\u0080': c = ' '; default: if(c<= ' '){//space switch(s){ case S_TAG: el.setTagName(source.slice(start,p));//tagName s = S_TAG_SPACE; break; case S_ATTR: attrName = source.slice(start,p) s = S_ATTR_SPACE; break; case S_ATTR_NOQUOT_VALUE: var value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer); errorHandler.warning('attribute "'+value+'" missed quot(")!!'); el.add(attrName,value,start) case S_ATTR_END: s = S_TAG_SPACE; break; //case S_TAG_SPACE: //case S_EQ: //case S_ATTR_SPACE: // void();break; //case S_TAG_CLOSE: //ignore warning } }else{//not space //S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE //S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE switch(s){ //case S_TAG:void();break; //case S_ATTR:void();break; //case S_ATTR_NOQUOT_VALUE:void();break; case S_ATTR_SPACE: var tagName = el.tagName; if(currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !attrName.match(/^(?:disabled|checked|selected)$/i)){ errorHandler.warning('attribute "'+attrName+'" missed value!! "'+attrName+'" instead2!!') } el.add(attrName,attrName,start); start = p; s = S_ATTR; break; case S_ATTR_END: errorHandler.warning('attribute space is required"'+attrName+'"!!') case S_TAG_SPACE: s = S_ATTR; start = p; break; case S_EQ: s = S_ATTR_NOQUOT_VALUE; start = p; break; case S_TAG_CLOSE: throw new Error("elements closed character '/' and '>' must be connected to"); } } }//end outer switch //console.log('p++',p) p++; } } /** * @return true if has new namespace define */ function appendElement(el,domBuilder,currentNSMap){ var tagName = el.tagName; var localNSMap = null; //var currentNSMap = parseStack[parseStack.length-1].currentNSMap; var i = el.length; while(i--){ var a = el[i]; var qName = a.qName; var value = a.value; var nsp = qName.indexOf(':'); if(nsp>0){ var prefix = a.prefix = qName.slice(0,nsp); var localName = qName.slice(nsp+1); var nsPrefix = prefix === 'xmlns' && localName }else{ localName = qName; prefix = null nsPrefix = qName === 'xmlns' && '' } //can not set prefix,because prefix !== '' a.localName = localName ; //prefix == null for no ns prefix attribute if(nsPrefix !== false){//hack!! if(localNSMap == null){ localNSMap = {} //console.log(currentNSMap,0) _copy(currentNSMap,currentNSMap={}) //console.log(currentNSMap,1) } currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value; a.uri = 'http://www.w3.org/2000/xmlns/' domBuilder.startPrefixMapping(nsPrefix, value) } } var i = el.length; while(i--){ a = el[i]; var prefix = a.prefix; if(prefix){//no prefix attribute has no namespace if(prefix === 'xml'){ a.uri = 'http://www.w3.org/XML/1998/namespace'; }if(prefix !== 'xmlns'){ a.uri = currentNSMap[prefix || ''] //{console.log('###'+a.qName,domBuilder.locator.systemId+'',currentNSMap,a.uri)} } } } var nsp = tagName.indexOf(':'); if(nsp>0){ prefix = el.prefix = tagName.slice(0,nsp); localName = el.localName = tagName.slice(nsp+1); }else{ prefix = null;//important!! localName = el.localName = tagName; } //no prefix element has default namespace var ns = el.uri = currentNSMap[prefix || '']; domBuilder.startElement(ns,localName,tagName,el); //endPrefixMapping and startPrefixMapping have not any help for dom builder //localNSMap = null if(el.closed){ domBuilder.endElement(ns,localName,tagName); if(localNSMap){ for(prefix in localNSMap){ domBuilder.endPrefixMapping(prefix) } } }else{ el.currentNSMap = currentNSMap; el.localNSMap = localNSMap; //parseStack.push(el); return true; } } function parseHtmlSpecialContent(source,elStartEnd,tagName,entityReplacer,domBuilder){ if(/^(?:script|textarea)$/i.test(tagName)){ var elEndStart = source.indexOf('</'+tagName+'>',elStartEnd); var text = source.substring(elStartEnd+1,elEndStart); if(/[&<]/.test(text)){ if(/^script$/i.test(tagName)){ //if(!/\]\]>/.test(text)){ //lexHandler.startCDATA(); domBuilder.characters(text,0,text.length); //lexHandler.endCDATA(); return elEndStart; //} }//}else{//text area text = text.replace(/&#?\w+;/g,entityReplacer); domBuilder.characters(text,0,text.length); return elEndStart; //} } } return elStartEnd+1; } function fixSelfClosed(source,elStartEnd,tagName,closeMap){ //if(tagName in closeMap){ var pos = closeMap[tagName]; if(pos == null){ //console.log(tagName) pos = source.lastIndexOf('</'+tagName+'>') if(pos<elStartEnd){//忘记闭合 pos = source.lastIndexOf('</'+tagName) } closeMap[tagName] =pos } return pos<elStartEnd; //} } function _copy(source,target){ for(var n in source){target[n] = source[n]} } function parseDCC(source,start,domBuilder,errorHandler){//sure start with '<!' var next= source.charAt(start+2) switch(next){ case '-': if(source.charAt(start + 3) === '-'){ var end = source.indexOf('-->',start+4); //append comment source.substring(4,end)//<!-- if(end>start){ domBuilder.comment(source,start+4,end-start-4); return end+3; }else{ errorHandler.error("Unclosed comment"); return -1; } }else{ //error return -1; } default: if(source.substr(start+3,6) == 'CDATA['){ var end = source.indexOf(']]>',start+9); domBuilder.startCDATA(); domBuilder.characters(source,start+9,end-start-9); domBuilder.endCDATA() return end+3; } //<!DOCTYPE //startDTD(java.lang.String name, java.lang.String publicId, java.lang.String systemId) var matchs = split(source,start); var len = matchs.length; if(len>1 && /!doctype/i.test(matchs[0][0])){ var name = matchs[1][0]; var pubid = len>3 && /^public$/i.test(matchs[2][0]) && matchs[3][0] var sysid = len>4 && matchs[4][0]; var lastMatch = matchs[len-1] domBuilder.startDTD(name,pubid && pubid.replace(/^(['"])(.*?)\1$/,'$2'), sysid && sysid.replace(/^(['"])(.*?)\1$/,'$2')); domBuilder.endDTD(); return lastMatch.index+lastMatch[0].length } } return -1; } function parseInstruction(source,start,domBuilder){ var end = source.indexOf('?>',start); if(end){ var match = source.substring(start,end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/); if(match){ var len = match[0].length; domBuilder.processingInstruction(match[1], match[2]) ; return end+2; }else{//error return -1; } } return -1; } /** * @param source */ function ElementAttributes(source){ } ElementAttributes.prototype = { setTagName:function(tagName){ if(!tagNamePattern.test(tagName)){ throw new Error('invalid tagName:'+tagName) } this.tagName = tagName }, add:function(qName,value,offset){ if(!tagNamePattern.test(qName)){ throw new Error('invalid attribute:'+qName) } this[this.length++] = {qName:qName,value:value,offset:offset} }, length:0, getLocalName:function(i){return this[i].localName}, getLocator:function(i){return this[i].locator}, getQName:function(i){return this[i].qName}, getURI:function(i){return this[i].uri}, getValue:function(i){return this[i].value} // ,getIndex:function(uri, localName)){ // if(localName){ // // }else{ // var qName = uri // } // }, // getValue:function(){return this.getValue(this.getIndex.apply(this,arguments))}, // getType:function(uri,localName){} // getType:function(i){}, } function _set_proto_(thiz,parent){ thiz.__proto__ = parent; return thiz; } if(!(_set_proto_({},_set_proto_.prototype) instanceof _set_proto_)){ _set_proto_ = function(thiz,parent){ function p(){}; p.prototype = parent; p = new p(); for(parent in thiz){ p[parent] = thiz[parent]; } return p; } } function split(source,start){ var match; var buf = []; var reg = /'[^']+'|"[^"]+"|[^\s<>\/=]+=?|(\/?\s*>|<)/g; reg.lastIndex = start; reg.exec(source);//skip < while(match = reg.exec(source)){ buf.push(match); if(match[1])return buf; } } exports.XMLReader = XMLReader;