티스토리 수익 글 보기
Skip to content
Navigation Menu
{{ message }}
-
-
Notifications
You must be signed in to change notification settings - Fork 610
Expand file tree
/
Copy pathetree.pyx
More file actions
3863 lines (3245 loc) · 135 KB
/
etree.pyx
File metadata and controls
3863 lines (3245 loc) · 135 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# cython: binding=True
# cython: auto_pickle=False
# cython: language_level=3
"""
The ``lxml.etree`` module implements the extended ElementTree API for XML.
"""
__docformat__ = "restructuredtext en"
__all__ = [
'AttributeBasedElementClassLookup', 'C14NError', 'C14NWriterTarget', 'CDATA',
'Comment', 'CommentBase', 'CustomElementClassLookup', 'DEBUG',
'DTD', 'DTDError', 'DTDParseError', 'DTDValidateError',
'DocumentInvalid', 'ETCompatXMLParser', 'ETXPath', 'Element',
'ElementBase', 'ElementClassLookup', 'ElementDefaultClassLookup',
'ElementNamespaceClassLookup', 'ElementTree', 'Entity', 'EntityBase',
'Error', 'ErrorDomains', 'ErrorLevels', 'ErrorTypes', 'Extension',
'FallbackElementClassLookup', 'FunctionNamespace', 'HTML', 'HTMLParser',
'ICONV_COMPILED_VERSION',
'LIBXML_COMPILED_VERSION', 'LIBXML_VERSION',
'LIBXML_FEATURES',
'LIBXSLT_COMPILED_VERSION', 'LIBXSLT_VERSION',
'LXML_VERSION',
'LxmlError', 'LxmlRegistryError', 'LxmlSyntaxError',
'NamespaceRegistryError', 'PI', 'PIBase', 'ParseError',
'ParserBasedElementClassLookup', 'ParserError', 'ProcessingInstruction',
'PyErrorLog', 'PythonElementClassLookup', 'QName', 'RelaxNG',
'RelaxNGError', 'RelaxNGErrorTypes', 'RelaxNGParseError',
'RelaxNGValidateError', 'Resolver', 'Schematron', 'SchematronError',
'SchematronParseError', 'SchematronValidateError', 'SerialisationError',
'SubElement', 'TreeBuilder', 'XInclude', 'XIncludeError', 'XML',
'XMLDTDID', 'XMLID', 'XMLParser', 'XMLSchema', 'XMLSchemaError',
'XMLSchemaParseError', 'XMLSchemaValidateError', 'XMLSyntaxError',
'XMLTreeBuilder', 'XPath', 'XPathDocumentEvaluator', 'XPathError',
'XPathEvalError', 'XPathEvaluator', 'XPathFunctionError', 'XPathResultError',
'XPathSyntaxError', 'XSLT', 'XSLTAccessControl', 'XSLTApplyError',
'XSLTError', 'XSLTExtension', 'XSLTExtensionError', 'XSLTParseError',
'XSLTSaveError', 'canonicalize',
'cleanup_namespaces', 'clear_error_log', 'dump',
'fromstring', 'fromstringlist', 'get_default_parser', 'iselement',
'iterparse', 'iterwalk', 'parse', 'parseid', 'register_namespace',
'set_default_parser', 'set_element_class_lookup', 'strip_attributes',
'strip_elements', 'strip_tags', 'tostring', 'tostringlist', 'tounicode',
'use_global_python_log'
]
cimport cython
from lxml cimport python
from lxml.includes cimport tree, config
from lxml.includes.tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement, _getNs
from lxml.includes.tree cimport const_xmlChar, xmlChar, _xcstr
from lxml.python cimport _cstr, _isString
from lxml.includes cimport xpath
from lxml.includes cimport c14n
# Cython's standard declarations
cimport cpython.mem
cimport cpython.ref
from cpython.buffer cimport PyBUF_SIMPLE, PyBUF_READ, PyBUF_FORMAT, PyBUF_ND, PyBUF_STRIDES
from libc cimport limits, stdio, stdlib, stdint
from libc cimport string as cstring_h # not to be confused with stdlib 'string'
from libc.string cimport const_char
cdef object os_path_abspath
from os.path import abspath as os_path_abspath
cdef object BytesIO, StringIO, BufferedWriter
from io import BytesIO, StringIO, BufferedWriter
cdef object OrderedDict
from collections import OrderedDict
cdef object _elementpath
from lxml import _elementpath
cdef object sys
import sys
cdef object re
import re
cdef object partial
from functools import partial
cdef object islice
from itertools import islice
cdef object ITER_EMPTY = iter(())
cdef object MutableMapping
from collections.abc import MutableMapping
class _ImmutableMapping(MutableMapping):
    # Permanently empty mapping: every lookup, store and delete raises
    # KeyError, it reports length 0 and it contains nothing.

    def __len__(self):
        return 0

    def __contains__(self, key):
        return False

    def __iter__(self):
        # hands out the shared module-level empty iterator
        return ITER_EMPTY

    # Python 2 style iteration aliases, all bound to __iter__
    iterkeys = __iter__
    itervalues = __iter__
    iteritems = __iter__

    def __getitem__(self, key):
        raise KeyError, key

    def __setitem__(self, key, value):
        raise KeyError, key

    def __delitem__(self, key):
        raise KeyError, key
cdef object IMMUTABLE_EMPTY_MAPPING = _ImmutableMapping()
del _ImmutableMapping
# the rules
# ---------
# any libxml C argument/variable is prefixed with c_
# any non-public function/class is prefixed with an underscore
# instance creation is always through factories
# what to do with libxml2/libxslt error messages?
# 0 : drop
# 1 : use log
DEF __DEBUG = 1
# maximum number of lines in the libxml2/xslt log if __DEBUG == 1
DEF __MAX_LOG_SIZE = 100
# make the compiled-in debug state publicly available
DEBUG = __DEBUG
# A struct to store a cached qualified tag name+href pair.
# While we can borrow the c_name from the document dict,
# PyPy requires us to store a Python reference for the
# namespace in order to keep the byte buffer alive.
cdef struct qname:
    # tag name; per the note above it can be borrowed from the document dict
    const_xmlChar* c_name
    # Python object reference for the namespace, stored so that the
    # underlying byte buffer stays alive (needed on PyPy, see note above)
    python.PyObject* href
# initialize parser (and threading)
xmlparser.xmlInitParser()
# global per-thread setup
tree.xmlThrDefIndentTreeOutput(1)
tree.xmlThrDefLineNumbersDefaultValue(1)
_initThreadLogging()
# filename encoding
cdef bytes _FILENAME_ENCODING = (sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii').encode("UTF-8")
cdef const char* _C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)
# set up some default namespace prefixes
cdef dict _DEFAULT_NAMESPACE_PREFIXES = {
b"http://www.w3.org/XML/1998/namespace": b'xml',
b"http://www.w3.org/1999/xhtml": b"html",
b"http://www.w3.org/1999/XSL/Transform": b"xsl",
b"http://www.w3.org/1999/02/22-rdf-syntax-ns#": b"rdf",
b"http://schemas.xmlsoap.org/wsdl/": b"wsdl",
# xml schema
b"http://www.w3.org/2001/XMLSchema": b"xs",
b"http://www.w3.org/2001/XMLSchema-instance": b"xsi",
# dublin core
b"http://purl.org/dc/elements/1.1/": b"dc",
# objectify
b"http://codespeak.net/lxml/objectify/pytype" : b"py",
}
# To avoid runtime encoding overhead, we keep a Unicode copy
# of the uri-prefix mapping as (str, str) items view.
cdef object _DEFAULT_NAMESPACE_PREFIXES_ITEMS = []
cdef _update_default_namespace_prefixes_items():
    # Rebuild the decoded (str, str) items view from the bytes-keyed
    # _DEFAULT_NAMESPACE_PREFIXES registry.
    cdef bytes uri_utf, prefix_utf
    cdef dict decoded = {}
    global _DEFAULT_NAMESPACE_PREFIXES_ITEMS
    for uri_utf, prefix_utf in _DEFAULT_NAMESPACE_PREFIXES.items():
        decoded[uri_utf.decode('utf-8')] = prefix_utf.decode('utf-8')
    _DEFAULT_NAMESPACE_PREFIXES_ITEMS = decoded.items()
_update_default_namespace_prefixes_items()
cdef object _check_internal_prefix = re.compile(br"ns\d+$").match
def register_namespace(prefix, uri):
    """Register a global prefix for a namespace URI.

    Newly created Elements in that namespace will use the prefix.  The
    registry is global: any existing mapping for either the given prefix
    or the given namespace URI is removed before the new one is stored.

    Raises ValueError for prefixes of the internally reserved form
    ``ns<digits>`` and for attempts to rebind the 'xml' prefix or the
    XML namespace.
    """
    prefix_utf = _utf8(prefix)
    uri_utf = _utf8(uri)
    if _check_internal_prefix(prefix_utf):
        raise ValueError("Prefix format reserved for internal use")
    _tagValidOrRaise(prefix_utf)
    _uriValidOrRaise(uri_utf)

    # the 'xml' prefix and the XML namespace may only be bound to each other
    names_xml_namespace = uri_utf == b"http://www.w3.org/XML/1998/namespace"
    names_xml_prefix = prefix_utf == b'xml'
    if names_xml_namespace != names_xml_prefix:
        raise ValueError("Cannot change the 'xml' prefix of the XML namespace")

    # collect conflicting entries first, then drop them
    stale_uris = [
        known_uri
        for known_uri, known_prefix in _DEFAULT_NAMESPACE_PREFIXES.items()
        if known_uri == uri_utf or known_prefix == prefix_utf
    ]
    for known_uri in stale_uris:
        del _DEFAULT_NAMESPACE_PREFIXES[known_uri]

    _DEFAULT_NAMESPACE_PREFIXES[uri_utf] = prefix_utf
    _update_default_namespace_prefixes_items()
# Error superclass for ElementTree compatibility.
# LxmlError below derives from this class.
class Error(Exception):
    pass
# module level superclass for all exceptions
class LxmlError(Error):
    """Main exception base class for lxml.

    All other exceptions inherit from this one.  Each instance carries an
    ``error_log`` attribute: a copy of the log passed as ``error_log``, or
    a copy of the global error log when none is given.
    """
    def __init__(self, message, error_log=None):
        super(_Error, self).__init__(message)
        if error_log is not None:
            log_snapshot = error_log.copy()
        else:
            log_snapshot = __copyGlobalErrorLog()
        self.error_log = log_snapshot
cdef object _Error = Error
# superclass for all syntax errors
class LxmlSyntaxError(LxmlError, SyntaxError):
    """Base class for all syntax errors.

    Derives from both LxmlError and the built-in SyntaxError.
    """
class C14NError(LxmlError):
    """Error during C14N serialisation.

    Subclass of LxmlError.
    """
# version information
cdef tuple __unpackDottedVersion(version):
    """Convert a dotted version byte string into a 4-item tuple.

    ``version`` is decoded as ASCII, '-' is treated like '.', and the
    result is padded with integer zeros / truncated to exactly four fields.
    Numeric fields become ints.  Pre-release fields map to negative numbers
    so that they sort before releases: 'dev' -> -300, 'alpha' -> -200,
    'beta' -> -100, each increased by the number that follows the tag
    (if any).  Any other non-numeric field is kept as a string.
    """
    version_list = []
    fields = (version.decode("ascii").replace('-', '.').split('.') + [0]*4)[:4]
    for item in fields:
        try:
            item = int(item)
        except ValueError:
            if item.startswith('dev'):
                count = item[3:]
                item = -300
            elif item.startswith('alpha'):
                count = item[5:]
                item = -200
            elif item.startswith('beta'):
                count = item[4:]
                item = -100
            else:
                # unknown tag: leave the field as the original string
                count = 0
            if count:
                item += int(count)
        version_list.append(item)
    return tuple(version_list)
cdef tuple __unpackIntVersion(int c_version, int base=100):
    # Split an integer-packed version number into its three "digits"
    # in the given base, most significant first.
    cdef int major = (c_version // (base*base)) % base
    cdef int minor = (c_version // base) % base
    cdef int patch = c_version % base
    return (major, minor, patch)
# Runtime libxml2 version as an integer, taken from the leading digits of
# tree.xmlParserVersion.
cdef int _LIBXML_VERSION_INT
try:
    _LIBXML_VERSION_INT = int(
        re.match('[0-9]+', (<unsigned char*>tree.xmlParserVersion).decode("ascii")).group(0))
except Exception:
    # Best effort: report the unparsable version string and fall back to 0
    # instead of failing the module import.
    print("Unknown libxml2 version: " + (<unsigned char*>tree.xmlParserVersion).decode("latin1"))
    _LIBXML_VERSION_INT = 0

# Public version tuples: LIBXML_VERSION comes from the value parsed above,
# LIBXML_COMPILED_VERSION from the tree.LIBXML_VERSION constant, and
# LXML_VERSION from lxml's own version string.
LIBXML_VERSION = __unpackIntVersion(_LIBXML_VERSION_INT)
LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION)
LXML_VERSION = __unpackDottedVersion(tree.LXML_VERSION_STRING)

__version__ = tree.LXML_VERSION_STRING.decode("ascii")
cdef extern from *:
"""
#ifdef ZLIB_VERNUM
#define __lxml_zlib_version (ZLIB_VERNUM >> 4)
#else
#define __lxml_zlib_version 0
#endif
#ifdef _LIBICONV_VERSION
#define __lxml_iconv_version (_LIBICONV_VERSION << 8)
#else
#define __lxml_iconv_version 0
#endif
"""
# zlib isn't included automatically by libxml2's headers
#long ZLIB_HEX_VERSION "__lxml_zlib_version"
long LIBICONV_HEX_VERSION "__lxml_iconv_version"
#ZLIB_COMPILED_VERSION = __unpackIntVersion(ZLIB_HEX_VERSION, base=0x10)
ICONV_COMPILED_VERSION = __unpackIntVersion(LIBICONV_HEX_VERSION, base=0x100)[:2]
cdef extern from "libxml/xmlversion.h":
"""
static const char* const _lxml_lib_features[] = {
#ifdef LIBXML_HTML_ENABLED
"html",
#endif
#ifdef LIBXML_FTP_ENABLED
"ftp",
#endif
#ifdef LIBXML_HTTP_ENABLED
"http",
#endif
#ifdef LIBXML_CATALOG_ENABLED
"catalog",
#endif
#ifdef LIBXML_XPATH_ENABLED
"xpath",
#endif
#ifdef LIBXML_ICONV_ENABLED
"iconv",
#endif
#ifdef LIBXML_ICU_ENABLED
"icu",
#endif
#ifdef LIBXML_REGEXP_ENABLED
"regexp",
#endif
#ifdef LIBXML_SCHEMAS_ENABLED
"xmlschema",
#endif
#ifdef LIBXML_SCHEMATRON_ENABLED
"schematron",
#endif
#ifdef LIBXML_ZLIB_ENABLED
"zlib",
#endif
#ifdef LIBXML_LZMA_ENABLED
"lzma",
#endif
0
};
"""
const char* const* _LXML_LIB_FEATURES "_lxml_lib_features"
cdef set _copy_lib_features():
    # Copy the NULL-terminated C array of compiled-in feature names
    # into a Python set of str.
    cdef set names = set()
    cdef Py_ssize_t index = 0
    while _LXML_LIB_FEATURES[index] is not NULL:
        names.add(_LXML_LIB_FEATURES[index].decode('ASCII'))
        index += 1
    return names
LIBXML_COMPILED_FEATURES = _copy_lib_features()
LIBXML_FEATURES = {
feature_name for feature_id, feature_name in [
#XML_WITH_THREAD = 1
#XML_WITH_TREE = 2
#XML_WITH_OUTPUT = 3
#XML_WITH_PUSH = 4
#XML_WITH_READER = 5
#XML_WITH_PATTERN = 6
#XML_WITH_WRITER = 7
#XML_WITH_SAX1 = 8
(xmlparser.XML_WITH_FTP, "ftp"), # XML_WITH_FTP = 9
(xmlparser.XML_WITH_HTTP, "http"), # XML_WITH_HTTP = 10
#XML_WITH_VALID = 11
(xmlparser.XML_WITH_HTML, "html"), # XML_WITH_HTML = 12
#XML_WITH_LEGACY = 13
#XML_WITH_C14N = 14
(xmlparser.XML_WITH_CATALOG, "catalog"), # XML_WITH_CATALOG = 15
(xmlparser.XML_WITH_XPATH, "xpath"), # XML_WITH_XPATH = 16
#XML_WITH_XPTR = 17
#XML_WITH_XINCLUDE = 18
(xmlparser.XML_WITH_ICONV, "iconv"), # XML_WITH_ICONV = 19
#XML_WITH_ISO8859X = 20
#XML_WITH_UNICODE = 21
(xmlparser.XML_WITH_REGEXP, "regexp"), # XML_WITH_REGEXP = 22
#XML_WITH_AUTOMATA = 23
#XML_WITH_EXPR = 24
(xmlparser.XML_WITH_SCHEMAS, "xmlschema"), # XML_WITH_SCHEMAS = 25
(xmlparser.XML_WITH_SCHEMATRON, "schematron"), # XML_WITH_SCHEMATRON = 26
#XML_WITH_MODULES = 27
#XML_WITH_DEBUG = 28
#XML_WITH_DEBUG_MEM = 29
#XML_WITH_DEBUG_RUN = 30 # unused
(xmlparser.XML_WITH_ZLIB, "zlib"), # XML_WITH_ZLIB = 31
(xmlparser.XML_WITH_ICU, "icu"), # XML_WITH_ICU = 32
(xmlparser.XML_WITH_LZMA, "lzma"), # XML_WITH_LZMA = 33
] if xmlparser.xmlHasFeature(feature_id)
}
cdef bint HAS_ZLIB_COMPRESSION = xmlparser.xmlHasFeature(xmlparser.XML_WITH_ZLIB)
# class for temporary storage of Python references,
# used e.g. for XPath results
@cython.final
@cython.internal
cdef class _TempStore:
    """Temporary storage that keeps Python references alive in a list."""
    cdef list _storage

    def __init__(self):
        self._storage = []

    cdef int add(self, obj) except -1:
        """Append ``obj`` to the store.  Returns 0, or -1 with an exception set."""
        self._storage.append(obj)
        return 0

    cdef int clear(self) except -1:
        """Empty the store in place.  Returns 0, or -1 with an exception set."""
        del self._storage[:]
        return 0
# class for temporarily storing exceptions raised in extensions
@cython.internal
cdef class _ExceptionContext:
    """Holds at most one stored exception so that it can be re-raised later.

    ``_exc_info`` is None when nothing is stored, otherwise a 3-tuple in
    ``sys.exc_info()`` layout.
    """
    cdef object _exc_info

    cdef int clear(self) except -1:
        """Forget any stored exception."""
        self._exc_info = None
        return 0

    cdef void _store_raised(self) noexcept:
        """Store the exception that is currently being handled.

        Must never propagate an exception itself (declared ``noexcept``).
        """
        try:
            self._exc_info = sys.exc_info()
        except BaseException as e:
            self._store_exception(e)
        finally:
            return  # and swallow any further exceptions

    cdef int _store_exception(self, exception) except -1:
        """Store an exception object that is not currently being raised."""
        self._exc_info = (exception, None, None)
        return 0

    cdef bint _has_raised(self) except -1:
        """Return true if an exception is currently stored."""
        return self._exc_info is not None

    cdef int _raise_if_stored(self) except -1:
        """Re-raise the stored exception, if any, and clear the store.

        Returns 0 when nothing was stored.
        """
        if self._exc_info is None:
            return 0
        exc_type, exc_value, exc_traceback = self._exc_info
        self._exc_info = None
        if exc_value is None and exc_traceback is None:
            # stored through _store_exception(): first slot is the exception itself
            raise exc_type
        else:
            raise exc_type, exc_value, exc_traceback
# type of a function that steps from node to node
ctypedef public xmlNode* (*_node_to_node_function)(xmlNode*)
################################################################################
# Include submodules
include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.)
include "apihelpers.pxi" # Private helper functions
include "xmlerror.pxi" # Error and log handling
################################################################################
# Public Python API
@cython.final
@cython.freelist(8)
cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]:
    """Internal base class to reference a libxml document.

    When instances of this class are garbage collected, the libxml
    document is cleaned up.
    """
    # counter and optional suffix used by buildNewPrefix() to generate
    # unique "nsX" prefixes for this document
    cdef int _ns_counter
    cdef bytes _prefix_tail
    # the libxml2 document owned by this object
    cdef xmlDoc* _c_doc
    # the parser associated with this document
    cdef _BaseParser _parser

    def __dealloc__(self):
        # if there are no more references to the document, it is safe
        # to clean the whole thing up, as all nodes have a reference to
        # the document
        tree.xmlFreeDoc(self._c_doc)

    cdef void initDict(self) noexcept:
        """Let the parser initialise the dict of the C document."""
        self._parser.initDocDict(self._c_doc)

    @cython.final
    cdef getroot(self):
        """Return an element proxy for the document root, or None if there is none."""
        cdef xmlNode* c_node
        c_node = tree.xmlDocGetRootElement(self._c_doc)
        if c_node is NULL:
            return None
        return _elementFactory(self, c_node)

    @cython.final
    cdef bint hasdoctype(self) noexcept:
        """Return true if the document has an internal subset."""
        # DOCTYPE gets parsed into internal subset (xmlDTD*)
        return self._c_doc is not NULL and self._c_doc.intSubset is not NULL

    @cython.final
    cdef getdoctype(self):
        """Return the doctype info as ``(root_name, public_id, sys_url)``.

        Each item is None if not known.  IDs from the internal subset take
        precedence; the external subset only fills in missing values.
        """
        cdef tree.xmlDtd* c_dtd
        cdef xmlNode* c_root_node
        public_id = None
        sys_url = None
        c_dtd = self._c_doc.intSubset
        if c_dtd is not NULL:
            if c_dtd.ExternalID is not NULL:
                public_id = funicode(c_dtd.ExternalID)
            if c_dtd.SystemID is not NULL:
                sys_url = funicode(c_dtd.SystemID)
        c_dtd = self._c_doc.extSubset
        if c_dtd is not NULL:
            if not public_id and c_dtd.ExternalID is not NULL:
                public_id = funicode(c_dtd.ExternalID)
            if not sys_url and c_dtd.SystemID is not NULL:
                sys_url = funicode(c_dtd.SystemID)
        c_root_node = tree.xmlDocGetRootElement(self._c_doc)
        if c_root_node is NULL:
            root_name = None
        else:
            root_name = funicode(c_root_node.name)
        return root_name, public_id, sys_url

    @cython.final
    cdef getxmlinfo(self):
        """Return ``(version, encoding)`` of the document, None where not known."""
        cdef xmlDoc* c_doc = self._c_doc
        if c_doc.version is NULL:
            version = None
        else:
            version = funicode(c_doc.version)
        if c_doc.encoding is NULL:
            encoding = None
        else:
            encoding = funicode(c_doc.encoding)
        return version, encoding

    @cython.final
    cdef isstandalone(self):
        """Return the standalone flag of the document.

        None if the C-level flag is -1, True if it is 1, False for any
        other value.
        """
        if self._c_doc.standalone == -1:
            return None
        else:
            return <bint>(self._c_doc.standalone == 1)

    @cython.final
    cdef bint ishtml(self):
        """Return true if the C document is an HTML document node."""
        return self._c_doc.type == tree.XML_HTML_DOCUMENT_NODE

    @cython.final
    cdef bytes buildNewPrefix(self):
        """Return a new unique prefix ("nsX") for this document."""
        cdef bytes ns
        if self._ns_counter < len(_PREFIX_CACHE):
            ns = _PREFIX_CACHE[self._ns_counter]
        else:
            ns = python.PyBytes_FromFormat("ns%d", self._ns_counter)
        if self._prefix_tail is not None:
            ns += self._prefix_tail
        self._ns_counter += 1
        if self._ns_counter < 0:
            # overflow!
            # restart the counter and lengthen the suffix so that
            # later prefixes differ from all earlier ones
            self._ns_counter = 0
            if self._prefix_tail is None:
                self._prefix_tail = b"A"
            else:
                self._prefix_tail += b"A"
        return ns

    @cython.final
    cdef xmlNs* _findOrBuildNodeNs(self, xmlNode* c_node,
                                   const_xmlChar* c_href, const_xmlChar* c_prefix,
                                   bint is_attribute) except NULL:
        """Get or create namespace structure for a node.  Reuses the prefix if
        possible.

        ``c_prefix`` may be NULL, in which case a registered default prefix
        for ``c_href`` or a newly generated one is used.  Raises MemoryError
        if libxml2 cannot allocate the namespace declaration.
        """
        cdef xmlNs* c_ns
        cdef python.PyObject* dict_result
        if c_node.type != tree.XML_ELEMENT_NODE:
            assert c_node.type == tree.XML_ELEMENT_NODE, \
                "invalid node type %d, expected %d" % (
                    c_node.type, tree.XML_ELEMENT_NODE)
        # look for existing ns declaration
        c_ns = _searchNsByHref(c_node, c_href, is_attribute)
        if c_ns is not NULL:
            if is_attribute and c_ns.prefix is NULL:
                # do not put namespaced attributes into the default
                # namespace as this would break serialisation
                pass
            else:
                return c_ns

        # none found => determine a suitable new prefix
        if c_prefix is NULL:
            dict_result = python.PyDict_GetItem(
                _DEFAULT_NAMESPACE_PREFIXES, <unsigned char*>c_href)
            if dict_result is not NULL:
                prefix = <object>dict_result
            else:
                prefix = self.buildNewPrefix()
            c_prefix = _xcstr(prefix)

        # make sure the prefix is not in use already
        while tree.xmlSearchNs(self._c_doc, c_node, c_prefix) is not NULL:
            prefix = self.buildNewPrefix()
            c_prefix = _xcstr(prefix)

        # declare the namespace and return it
        c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
        if c_ns is NULL:
            raise MemoryError()
        return c_ns

    @cython.final
    cdef int _setNodeNs(self, xmlNode* c_node, const_xmlChar* c_href) except -1:
        "Lookup namespace structure and set it for the node."
        c_ns = self._findOrBuildNodeNs(c_node, c_href, NULL, 0)
        tree.xmlSetNs(c_node, c_ns)
        return 0
cdef tuple __initPrefixCache():
    # Pre-build the byte strings b"ns0" .. b"ns25" once.
    cdef int index
    cdef list prefixes = []
    for index in range(26):
        prefixes.append(python.PyBytes_FromFormat("ns%d", index))
    return tuple(prefixes)
cdef tuple _PREFIX_CACHE = __initPrefixCache()
cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser):
    # Wrap a libxml2 document in a new _Document proxy.  When no parser is
    # given, the default parser of the global parser context is used.
    cdef _Document document = _Document.__new__(_Document)
    document._c_doc = c_doc
    document._ns_counter = 0
    document._prefix_tail = None
    # resolve the parser only after the proxy has taken over c_doc
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    document._parser = parser
    return document
cdef object _find_invalid_public_id_characters = re.compile(
ur"[^\x20\x0D\x0Aa-zA-Z0-9'()+,./:=?;!*#@$_%-]+").search
cdef class DocInfo:
    "Document information provided by parser and DTD."
    cdef _Document _doc

    def __cinit__(self, tree):
        "Create a DocInfo object for an ElementTree object or root Element."
        self._doc = _documentOrRaise(tree)
        root_name, public_id, system_url = self._doc.getdoctype()
        # DOCTYPE IDs without a root node cannot be represented
        if (public_id or system_url) and not root_name:
            raise ValueError("Could not find root node")

    @property
    def root_name(self):
        """Returns the name of the root node as defined by the DOCTYPE."""
        return self._doc.getdoctype()[0]

    @cython.final
    cdef tree.xmlDtd* _get_c_dtd(self) noexcept:
        """Return the DTD. Create it if it does not yet exist."""
        cdef xmlDoc* c_doc = self._doc._c_doc
        cdef xmlNode* c_root_node
        cdef const_xmlChar* c_name = NULL
        if c_doc.intSubset is not NULL:
            return c_doc.intSubset
        # no internal subset yet: create one, named after the root (if any)
        c_root_node = tree.xmlDocGetRootElement(c_doc)
        if c_root_node is not NULL:
            c_name = c_root_node.name
        return tree.xmlCreateIntSubset(c_doc, c_name, NULL, NULL)

    def clear(self):
        """Removes DOCTYPE and internal subset from the document."""
        cdef xmlDoc* c_doc = self._doc._c_doc
        cdef tree.xmlNode* c_dtd = <xmlNode*>c_doc.intSubset
        if c_dtd is not NULL:
            tree.xmlUnlinkNode(c_dtd)
            tree.xmlFreeNode(c_dtd)

    property public_id:
        """Public ID of the DOCTYPE.

        Mutable.  May be set to a valid string or None.  If a DTD does not
        exist, setting this variable (even to None) will create one.
        """
        def __get__(self):
            return self._doc.getdoctype()[1]

        def __set__(self, value):
            cdef xmlChar* c_value = NULL
            cdef tree.xmlDtd* c_dtd
            if value is not None:
                invalid = _find_invalid_public_id_characters(value)
                if invalid is not None:
                    raise ValueError(
                        f'Invalid character(s) {invalid.group(0)!r} in public_id.')
                value = _utf8(value)
                c_value = tree.xmlStrdup(_xcstr(value))
                if c_value is NULL:
                    raise MemoryError()

            c_dtd = self._get_c_dtd()
            if c_dtd is NULL:
                # do not leak the copy made above
                tree.xmlFree(c_value)
                raise MemoryError()
            if c_dtd.ExternalID is not NULL:
                tree.xmlFree(<void*>c_dtd.ExternalID)
            c_dtd.ExternalID = c_value

    property system_url:
        """System ID of the DOCTYPE.

        Mutable.  May be set to a valid string or None.  If a DTD does not
        exist, setting this variable (even to None) will create one.
        """
        def __get__(self):
            return self._doc.getdoctype()[2]

        def __set__(self, value):
            cdef xmlChar* c_value = NULL
            cdef tree.xmlDtd* c_dtd
            if value is not None:
                bvalue = _utf8(value)
                # sys_url may be any valid unicode string that can be
                # enclosed in single quotes or quotes.
                if b'"' in bvalue and b"'" in bvalue:
                    raise ValueError(
                        'System URL may not contain both single (\') and double quotes (").')
                c_value = tree.xmlStrdup(_xcstr(bvalue))
                if c_value is NULL:
                    raise MemoryError()

            c_dtd = self._get_c_dtd()
            if c_dtd is NULL:
                # do not leak the copy made above
                tree.xmlFree(c_value)
                raise MemoryError()
            if c_dtd.SystemID is not NULL:
                tree.xmlFree(<void*>c_dtd.SystemID)
            c_dtd.SystemID = c_value

    @property
    def xml_version(self):
        """Returns the XML version as declared by the document."""
        return self._doc.getxmlinfo()[0]

    @property
    def encoding(self):
        """Returns the encoding name as declared by the document."""
        return self._doc.getxmlinfo()[1]

    @property
    def standalone(self):
        """Returns the standalone flag as declared by the document.  The possible
        values are True (``standalone='yes'``), False
        (``standalone='no'`` or flag not provided in the declaration),
        and None (unknown or no declaration found).  Note that a
        normal truth test on this value will always tell if the
        ``standalone`` flag was set to ``'yes'`` or not.
        """
        return self._doc.isstandalone()

    @property
    def is_html(self):
        return self._doc.ishtml()

    property URL:
        "The source URL of the document (or None if unknown)."
        def __get__(self):
            cdef xmlDoc* c_doc = self._doc._c_doc
            if c_doc.URL is NULL:
                return None
            return _decodeFilename(c_doc.URL)

        def __set__(self, url):
            cdef xmlDoc* c_doc
            url = _encodeFilename(url)
            c_doc = self._doc._c_doc
            c_oldurl = c_doc.URL
            if url is not None:
                c_doc.URL = tree.xmlStrdup(_xcstr(url))
            else:
                c_doc.URL = NULL
            # release the previous value only after it has been replaced
            if c_oldurl is not NULL:
                tree.xmlFree(<void*>c_oldurl)

    @property
    def doctype(self):
        """Returns a DOCTYPE declaration string for the document."""
        root_name, public_id, system_url = self._doc.getdoctype()
        if not system_url:
            if public_id:
                return f'<!DOCTYPE {root_name} PUBLIC "{public_id}">'
            if self._doc.hasdoctype():
                return f'<!DOCTYPE {root_name}>'
            return ''

        # If '"' in system_url, we must escape it with single
        # quotes, otherwise escape with double quotes.  If url
        # contains both a single quote and a double quote, XML
        # standard is being violated.
        quoted_system_url = (
            f"'{system_url}'" if '"' in system_url else f'"{system_url}"')
        if public_id:
            return f'<!DOCTYPE {root_name} PUBLIC "{public_id}" {quoted_system_url}>'
        return f'<!DOCTYPE {root_name} SYSTEM {quoted_system_url}>'

    @property
    def internalDTD(self):
        """Returns a DTD validator based on the internal subset of the document."""
        return _dtdFactory(self._doc._c_doc.intSubset)

    @property
    def externalDTD(self):
        """Returns a DTD validator based on the external subset of the document."""
        return _dtdFactory(self._doc._c_doc.extSubset)
@cython.no_gc_clear
cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
"""Element class.
References a document object and a libxml node.
By pointing to a Document instance, a reference is kept to
_Document as long as there is some pointer to a node in it.
"""
cdef _Document _doc
cdef xmlNode* _c_node
cdef object _tag
def _init(self):
    """_init(self)

    Called after object initialisation.  Custom subclasses may override
    this if they recursively call _init() in the superclasses.

    The implementation in this class does nothing.
    """
@cython.linetrace(False)
@cython.profile(False)
def __dealloc__(self):
    # A proxy without a C node has nothing to release.
    if self._c_node is NULL:
        return
    # debugging aids:
    #print("trying to free node:", <int>self._c_node)
    #displayNode(self._c_node, 0)
    _unregisterProxy(self)
    attemptDeallocation(self._c_node)
# MANIPULATORS
def __setitem__(self, x, value):
    """__setitem__(self, x, value)

    Replaces the given subelement index or slice.

    Raises ValueError if ``value`` is None and IndexError if the index
    does not refer to an existing subelement.
    """
    cdef xmlNode* c_node = NULL
    cdef xmlNode* c_next
    cdef xmlDoc* c_source_doc
    cdef _Element element
    cdef bint left_to_right
    cdef Py_ssize_t slicelength = 0, step = 0
    _assertValidNode(self)
    if value is None:
        raise ValueError, "cannot assign None"
    if isinstance(x, slice):
        # slice assignment
        _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
        if step > 0:
            left_to_right = 1
        else:
            left_to_right = 0
            # pass the step on as a positive value; negating
            # PY_SSIZE_T_MIN would overflow, so clamp it instead
            step = -step if step != python.PY_SSIZE_T_MIN else python.PY_SSIZE_T_MAX
        _replaceSlice(self, c_node, slicelength, step, left_to_right, value)
        return
    else:
        # otherwise: normal item assignment
        element = value
        _assertValidNode(element)
        c_node = _findChild(self._c_node, x)
        if c_node is NULL:
            raise IndexError, "list index out of range"
        # remember where the new element came from before it is relinked
        c_source_doc = element._c_node.doc
        c_next = element._c_node.next
        _removeText(c_node.next)
        tree.xmlReplaceNode(c_node, element._c_node)
        _moveTail(c_next, element._c_node)
        moveNodeToDocument(self._doc, c_source_doc, element._c_node)
        if not attemptDeallocation(c_node):
            moveNodeToDocument(self._doc, c_node.doc, c_node)
def __delitem__(self, x):
    """__delitem__(self, x)

    Deletes the given subelement or a slice.
    """
    cdef xmlNode* c_child = NULL
    cdef xmlNode* c_following
    cdef Py_ssize_t step = 0, slicelength = 0
    _assertValidNode(self)

    if not isinstance(x, slice):
        # item deletion
        c_child = _findChild(self._c_node, x)
        if c_child is NULL:
            raise IndexError(f"index out of range: {x}")
        _removeNode(self._doc, c_child)
        return

    if not _isFullSlice(<slice>x):
        # partial slice deletion
        _findChildSlice(<slice>x, self._c_node, &c_child, &step, &slicelength)
        _deleteSlice(self._doc, c_child, slicelength, step)
        return

    # full slice: remove every element child
    c_child = self._c_node.children
    if c_child is not NULL and not _isElement(c_child):
        c_child = _nextElement(c_child)
    while c_child is not NULL:
        # look up the successor before the current node is removed
        c_following = _nextElement(c_child)
        _removeNode(self._doc, c_child)
        c_child = c_following
def __deepcopy__(self, memo):
    """__deepcopy__(self, memo)

    Returns the result of ``self.__copy__()``; ``memo`` is not used.
    """
    return self.__copy__()
def __copy__(self):
    """__copy__(self)

    Copies this node into a new document and returns the proxy of the copy.
    """
    cdef xmlDoc* c_new_doc
    cdef xmlNode* c_candidate
    cdef _Document copied_doc
    _assertValidNode(self)
    c_new_doc = _copyDocRoot(self._doc._c_doc, self._c_node)  # recursive
    copied_doc = _documentFactory(c_new_doc, self._doc._parser)
    copied_root = copied_doc.getroot()
    if copied_root is not None:
        return copied_root

    # Comment/PI: no root element in the copy, so pick the first
    # top-level node of the same type as this one
    c_candidate = c_new_doc.children
    while c_candidate is not NULL:
        if c_candidate.type == self._c_node.type:
            return _elementFactory(copied_doc, c_candidate)
        c_candidate = c_candidate.next
    return None
def set(self, key, value):
    """set(self, key, value)

    Sets an element attribute.

    In HTML documents (not XML or XHTML), the value None is allowed and creates
    an attribute without value (just the attribute name).
    """
    # NOTE(review): _assertValidNode() is defined outside this chunk;
    # presumably it rejects proxies without a backing C node - confirm.
    _assertValidNode(self)
    _setAttributeValue(self, key, value)
def append(self, _Element element not None):
    """append(self, element)

    Adds a subelement to the end of this element.

    ``element`` must be an _Element and must not be None.
    """
    # both proxies are checked before the tree is modified
    _assertValidNode(self)
    _assertValidNode(element)
    _appendChild(self, element)
def addnext(self, _Element element not None):
    """addnext(self, element)

    Adds the element as a following sibling directly after this
    element.

    This is normally used to set a processing instruction or comment after
    the root node of a document.  Note that tail text is automatically
    discarded when adding at the root level.
    """
    cdef xmlNode* c_parent
    cdef bint at_root_level
    _assertValidNode(self)
    _assertValidNode(element)
    c_parent = self._c_node.parent
    # root level: this node has a parent, but the parent is not an element
    at_root_level = c_parent != NULL and not _isElement(c_parent)
    if at_root_level:
        if (element._c_node.type != tree.XML_PI_NODE
                and element._c_node.type != tree.XML_COMMENT_NODE):
            raise TypeError("Only processing instructions and comments can be siblings of the root element")
        element.tail = None
    _appendSibling(self, element)
def addprevious(self, _Element element not None):
"""addprevious(self, element)
Adds the element as a preceding sibling directly before this
element.
This is normally used to set a processing instruction or comment
before the root node of a document. Note that tail text is
You can’t perform that action at this time.