
    ,gLb                        d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
Z
ddlmZmZmZ  ed eD              Z ed eD              Z ed	 e	D              Ze ed
dg      z  Z ej(                  d      Zh dZ ej(                  d      Zi Zd Z G d d      Z G d de      Z G d de      Z G d d      Z G d d      Zd Z y)    N)BytesIOStringIO)Path)ascii_lettersascii_uppercase   )EOFReparseErrorspace_charactersc              #   <   K   | ]  }|j                           y wNencode.0items     m/home/viktor/gitlab-persoonlijk/factuur-applicatie/venv/lib/python3.12/site-packages/tinyhtml5/inputstream.py	<genexpr>r      s     "NT4;;="N   c              #   <   K   | ]  }|j                           y wr   r   r   s     r   r   r      s     H$Hr   c              #   <   K   | ]  }|j                           y wr   r   r   s     r   r   r      s     !LD$++-!Lr      >   <u   [---﷐-﷯￾￿🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿-]>            	 
               	 
       z[	- -/:-@\[-`{-~]c                    t        | t              rJt        |       dk  r<t        |       j	                         r#t        t        |       j                         fi |S t        | t              rt        | j                         fi |S t        t        | d      r| j                  d      n| t              rt        | fi |S t        | fi |S )N   readr   )

isinstancestrlenr   is_fileHTMLUnicodeInputStream	read_texthasattrr<   HTMLBinaryInputStream)sourcekwargss     r   HTMLInputStreamrG   &   s    &#3v;#4f9M9M9O%d6l&<&<&>I&II	FD	!%f&6&6&8CFCC	gff&=FKKN63	O%f777$V6v66    c                   H    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
dd	Zd
 Zy)rA   zProvides a Unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    c                     dg| _         t        d      df| _        | j                  |      | _        | j                          y)a  Initialise the HTMLInputStream.

        Create a normalized stream from source for use by tinyhtml5.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element).

        r   utf-8certainN)	new_lineslookup_encodingencodingopen_streamstreamresetselfrE   s     r   __init__zHTMLUnicodeInputStream.__init__9   s9     (19=&&v.

rH   c                 f    d| _         d| _        d| _        g | _        d| _        d| _        d | _        y )N r   )chunk
chunk_sizechunk_offseterrorsprevious_number_linesprevious_number_columns_buffered_characterrT   s    r   rR   zHTMLUnicodeInputStream.resetN   s;    
 &'"'($ $( rH   c                 4    t        |d      r|S t        |      S )zuProduce a file object from source.

        source can be either a file object, local filename or a string.

        r<   )rC   r   rS   s     r   rP   z"HTMLUnicodeInputStream.open_stream\   s     !0vFhv6FFrH   c                     | j                   }|j                  dd|      }| j                  |z   }|j                  dd|      }|dk(  r| j                  |z   }||fS ||dz   z
  }||fS )N
r   r   )rX   countr\   rfindr]   )rT   offsetrX   number_linesposition_linelast_line_positionposition_columns          r   	_positionz HTMLUnicodeInputStream._positiond   s    

{{4F322\A"[[q&9#"::VCO // %(:Q(>?O//rH   c                 L    | j                  | j                        \  }}|dz   |fS )z9Return (line, col) of the current position in the stream.r   )rk   rZ   )rT   linecolumns      r   positionzHTMLUnicodeInputStream.positiono   s)    ~~d&7&78fq&!!rH   c                     | j                   | j                  k\  r| j                         st        S | j                   }| j                  |   }|dz   | _         |S )zlRead one character from the stream or queue if available.

        Return EOF when EOF is reached.

        r   )rZ   rY   
read_chunkr	   rX   )rT   rZ   	characters      r   rr   z HTMLUnicodeInputStream.charactert   sQ     /??$
((JJ|,	(1,rH   c                    | j                  | j                        \  | _        | _        d| _        d| _        d| _        | j                  j                  d      }| j                  r| j                  |z   }d | _        n|syt        |      dkD  r0t        |d         }|dk(  sd|cxk  rd	k  rn n|d   | _        |d d }t        t        t        j                  |                  D ]  }| j                  j                  d
        |j!                  dd      }|j!                  dd      }|| _        t        |      | _        y)NrW   r   i (  Fr   rc      i   i  zinvalid-codepointz
rb   T)rk   rY   r\   r]   rX   rZ   rQ   r<   r^   r?   ordrangeinvalid_unicode_refindallr[   appendreplace)rT   datalast_s       r   rq   z!HTMLUnicodeInputStream.read_chunk   s+   CG>>OOD@"D$@ 
{{& ##++d2D'+D$t9q=tBx=Dt|v77+/8(CRy s-55d;<= 	4AKK23	4 ||FD)||D$'
d)rH   c           
         	 t         ||f   }g }	 |j                  | j                  | j                        }|| j                  | j                  k7  rPn|j                         }|| j                  k7  r0|j                  | j                  | j                  |        || _        n:|j                  | j                  | j                  d        | j                         sndj                  |      S # t        $ rc dj                  |D cg c]  }dt        |      d nc c}w c}      }|sd| }t	        j
                  d| d      }|x}t         ||f<   Y Jw xY w)a   Return a string of characters from the stream.

        String goes up to but does not include any character in 'characters' or
        EOF. 'characters' must be a container that supports the 'in' method and
        iteration over its characters.

        rW   z\x02x^[z]+N)characters_until_regexKeyErrorjoinrv   recompilematchrX   rZ   rY   endrz   rq   )rT   
charactersoppositerr   regexresultr   r   s           r   chars_untilz"HTMLUnicodeInputStream.chars_until   sS   	P/X0FGJ $$TZZ1B1BCE} $$7iik $//)MM$**T->->s"CD(+D% MM$**T%6%6%789??$+ . wwvA  	PGGTIs3y>#"67TTUEE7JJ5'}-EJOOJ/X0FG	Ps   C, ,ED
:EEc                     |t         urm| j                  dk(  r*|| j                  z   | _        | xj                  dz  c_        y | xj                  dz  c_        | j                  | j                     |k(  sJ y y )Nr   r   )r	   rZ   rX   rY   )rT   chars     r   ungetzHTMLUnicodeInputStream.unget   sl     s?  A%
 "DJJ.
1$!!Q&!zz$"3"34<<< rH   N)F)__name__
__module____qualname____doc__rU   rR   rP   rk   ro   rr   rq   r   r    rH   r   rA   rA   1   s7    *(G	0"
"#J,\=rH   rA   c                   N     e Zd ZdZ	 	 	 d	dZ fdZd Zd Zd Zd Z	d Z
 xZS )
rD   zProvide a binary stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    c                     | j                  |      | _        d| _        || _        || _        || _        || _        || _        | j                         | _	        | j                  d   J | j                          y )Ni   r   )rP   
raw_streamnumber_bytes_metaoverride_encodingtransport_encodingsame_origin_parent_encodinglikely_encodingdefault_encodingdetermine_encodingrO   rR   )rT   rE   r   r   r   r   r   s          r   rU   zHTMLBinaryInputStream.__init__   sz    
 **62
 "&!2"4+F(. 0 //1}}Q+++ 	

rH   c                     | j                   d   j                  j                  } || j                  d      | _        t
        |           y )Nr   r{   )rO   
codec_infostreamreaderr   rQ   superrR   )rT   r   	__class__s     r   rR   zHTMLBinaryInputStream.reset	  s8    }}Q'22??"4??I>rH   c                     t        |d      r.t        |d      r|j                         r|S |j                         }t        |      S )Nr<   seekable)rC   r   r<   r   rS   s     r   rP   z!HTMLBinaryInputStream.open_stream  s9    66"vz*v/@[[]FvrH   c                    | j                         df}|d   |S t        | j                        df}|d   |S t        | j                        df}|d   |S | j	                         df}|d   |S t        | j
                        df}|d    |d   j                  j                  d      s|S t        | j                        df}|d   |S t        | j                        df}|d   |S t        d      dfS )NrL   r   	tentativezutf-16windows-1252)

detect_bomrN   r   r   detect_encoding_metar   name
startswithr   r   )rT   rO   s     r   r   z(HTMLBinaryInputStream.determine_encoding  s/    ??$i/A;"O #4#9#9:IEA;"O #4#:#:;YFA;"O ,,.;A;"O #4#C#CDkQA;"8A;+;+;+F+Fx+PO #4#7#78+EA;"O #4#8#89;FA;"O ~.;;rH   c                 x   | j                   d   dk7  sJ t        |      x}y |j                  dv rt        d      }|J y || j                   d   k(  r| j                   d   df| _         y | j                  j	                  d       |df| _         | j                          t        d| j                   d    d|       )Nr   rL   utf-16beutf-16lerK   r   zEncoding changed from z to )rO   rN   r   r   seekrR   r
   )rT   new_encodings     r   change_encodingz%HTMLBinaryInputStream.change_encoding=  s    }}Q9,,,+L99LB 88*73L+++T]]1--!]]1-y9DMOO  #)95DMJJL(q)9(:$|nMO OrH   c           
         t         j                  dt         j                  dt         j                  dt         j                  dt         j
                  di}| j                  j                  d      }t        |t              sJ dD ]@  }|j                  |d|       x}s| j                  j                  |       t        |      c S  | j                  j                  d	       y)
zAttempt to detect at BOM at the start of the stream.

        If an encoding can be determined from the BOM return the name of the
        encoding otherwise return None.

        rK   r   r   zutf-32lezutf-32be   )   r      Nr   )codecsBOM_UTF8BOM_UTF16_LEBOM_UTF16_BEBOM_UTF32_LEBOM_UTF32_BEr   r<   r=   bytesgetr   rN   )rT   bomsstringr   rO   s        r   r   z HTMLBinaryInputStream.detect_bomM  s     OOW
 %%a(&%(((  	1D88F5DM22x2$$T*&x00		1 	QrH   c                    | j                   j                  | j                        }t        |t              sJ t        |      }| j                   j                  d       |j                         }||j                  dv rt        d      }|S )z1Report the encoding declared by the meta element.r   r   rK   )
r   r<   r   r=   r   EncodingParserr   get_encodingr   rN   )rT   bufferparserrO   s       r   r   z*HTMLBinaryInputStream.detect_encoding_metaj  sw    %%d&<&<=&%((('Q&&(HMM5M$M&w/HrH   )NNNNr   )r   r   r   r   rU   rR   rP   r   r   r   r   __classcell__)r   s   @r   rD   rD      s:     KOCG"02
&<PO  :rH   rD   c                   r    e Zd ZdZd Zd Zd Zd Zd Zd Z	 e
e	e      Ze
d        Zefd	Zd
 Zd Zd Zy)EncodingByteszBytes-like object with an associated position and various extra methods.

    If the position is ever greater than the string length then an exception is
    raised.

    c                 n    t        |t              sJ t        j                  | |j                               S r   )r=   r   __new__lower)clsvalues     r   r   zEncodingBytes.__new__  s)    %'''}}S%++-00rH   c                     d| _         y )Nrc   rk   )rT   r   s     r   rU   zEncodingBytes.__init__  s	    rH   c                 f    | j                   dz   x}| _         |t        |       k\  rt        | ||dz    S Nr   rk   r?   StopIterationrT   ro   s     r   __next__zEncodingBytes.__next__  s:    $(NNQ$664>s4y HX\**rH   c                 >    | j                   dz
  x| _         }| ||dz    S r   r   r   s     r   previouszEncodingBytes.previous  s(    $(NNQ$66HX\**rH   c                 b    | j                   t        |       k\  rt        t        d|      | _         y Nr   )rk   r?   r   maxr   s     r   set_positionzEncodingBytes.set_position  s&    >>SY&Q)rH   c                 v    | j                   t        |       k\  rt        | j                   dk\  r| j                   S y r   r   r_   s    r   get_positionzEncodingBytes.get_position  s3    >>SY&>>Q>>! rH   c                 :    | | j                   | j                   dz    S r   )ro   r_   s    r   current_bytezEncodingBytes.current_byte  s    DMM$--!"344rH   c                     | j                   }|t        |       k  r)| ||dz    }||vr	|| _        |S |dz  }|t        |       k  r)|| _        y)zSkip past a list of characters.r   Nro   r?   rk   rT   r   ro   rr   s       r   skipzEncodingBytes.skip  s_    ==T"Xhl3I
*!)  MH T" "rH   c                     | j                   }|t        |       k  r)| ||dz    }||v r	|| _        |S |dz  }|t        |       k  r)|| _        y r   r   r   s       r   
skip_untilzEncodingBytes.skip_until  s_    ==T"Xhl3IJ&!)  MH T" "rH   c                 ~    | j                  || j                        x}r| xj                  t        |      z  c_        |S )zLook for a sequence of bytes at the start of a string.

        If the bytes are found return True and advance the position to the byte
        after the match. Otherwise return False and leave the position alone.

        )r   ro   r?   )rT   r   r   s      r   match_byteszEncodingBytes.match_bytes  s5     __UDMM::6:MMSZ'MrH   c                     	 | j                  || j                        t        |      z   dz
  | _        y# t        $ r t
        w xY w)zLook for the next sequence of bytes matching a given sequence.

        If a match is found advance the position to the last byte of the match.

        r   T)indexro   r?   rk   
ValueErrorr   )rT   r   s     r   jump_tozEncodingBytes.jump_to  sG    	 !ZZt}}=E
JQNDN   	 	 s	   03 AN)r   r   r   r   r   rU   r   r   r   r   propertyro   r   space_characters_bytesr   r   r   r   r   rH   r   r   r   x  s^    1++*
" l3H5 5 5 
		
rH   r   c                   F    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zy)r   z@Mini parser for detecting character encoding from meta elements.c                 2    t        |      | _        d | _        y r   )r   r|   rO   rT   r|   s     r   rU   zEncodingParser.__init__  s    !$'	rH   c                    d| j                   vry | j                  | j                  | j                  | j                  | j                  | j
                  d}| j                   D ]l  }d}	 | j                   j                  d       |j                         D ]*  \  }}| j                   j                  |      s"	  |       } n |ra | j                  S  | j                  S # t        $ r Y  | j                  S w xY w# t        $ r d}Y  Fw xY w)N   <meta)s   <!--r   s   </s   <!s   <?r   Tr   F)r|   handle_commenthandle_metahandle_possible_end_taghandle_otherhandle_possible_start_tagr   r   itemsr   rO   )rT   method_dispatchr~   keep_parsingkeymethods         r   r   zEncodingParser.get_encoding  s   499$ ((&&//$$$$00
  	AL		!!$'  /446 V99((-'-x	  }}#	" }} !  }} ) ',s$   'C5C7	C43C47DDc                 8    | j                   j                  d      S )zSkip over comments.s   -->r|   r   r_   s    r   r   zEncodingParser.handle_comment  s    yy  ((rH   c                 ~   | j                   j                  t        vryd}d }	 | j                         x}y|d   dk(  r|d   dk(  }|rv|t|| _        y|d   dk(  r|d   }t        |      }|R|| _        y|d   dk(  rBt        t        |d               }|j                         x}t        |      }||r|| _        y|})	NTFr   s
   http-equivr   s   content-type   charsets   content)	r|   r   r   get_attributerO   rN   ContentAttributeParserr   parse)rT   
has_pragmapending_encoding	attributetentative_encodingcodeccontent_parsers          r   r   zEncodingParser.handle_meta  s    99!!)??
!//11	:|},&q\_<
"2">$4DM 1+%.q\"'(:;$$)DM 1+!7iPQl8S!T*8*>*>*@@&M+,>?E(%,1DM#(+0(1 rH   c                 &    | j                  d      S )NFend_tag)handle_possible_tagr_   s    r   r   z(EncodingParser.handle_possible_start_tag  s    '''66rH   c                 P    t        | j                         | j                  d      S )NTr  )nextr|   r  r_   s    r   r   z&EncodingParser.handle_possible_end_tag  s!    TYY'''55rH   c                    | j                   }|j                  t        vr#|r |j                          | j	                          y|j                  t              }|dk(  r|j                          y	 | j                         	 y)NTr   )r|   r   ascii_letters_bytesr   r   r   spaces_angle_bracketsr  )rT   r  r|   rr   s       r   r  z"EncodingParser.handle_possible_tag#  s}    yy$77 !!#OO$9:	 MMO  %%'/ rH   c                 8    | j                   j                  d      S )Nr   r  r_   s    r   r   zEncodingParser.handle_other:  s    yy  &&rH   c                    | j                   }|j                  t        t        dg      z        }|t	        |      dk(  sJ |dv ryg }g }	 |dk(  r|rnx|t        v r|j                         }n_|dv rdj                  |      dfS |t        v r |j                  |j                                n|y|j                  |       t        |      }|dk7  r#|j                          dj                  |      dfS t        |       |j                         }|x}dv rx	 t        |      }||k(  r-t        |       dj                  |      dj                  |      fS |t        v r |j                  |j                                n|j                  |       w|d	k(  rdj                  |      dfS |t        v r |j                  |j                                n|y|j                  |       	 t        |      }|t        v r"dj                  |      dj                  |      fS |t        v r |j                  |j                                n|y|j                  |       r)
z{Return a (name, value) pair for the next attribute in the stream.

        If no attribute is found, return None.

           /Nr   )r   N   =)r  r   rH   )   '   "r   )r|   r   r   	frozensetr?   r   ascii_uppercase_bytesrz   r   r  r   r  )rT   r|   rr   attribute_nameattribute_valuequotes         r   r  zEncodingParser.get_attribute=  sJ    yyII4y$7HHI	 C	Na$777$D ^44 IIK	l*xx/4433%%ioo&78"%%i0T
I! $ MMO88N+S00T
IIK	E</ J	%J88N3SXXo5NNN"77#**9??+<= $**95  $88N+S00//""9??#45""9-T
I11xx//1JJJ33&&y'89"&&y1 rH   N)r   r   r   r   rU   r   r   r   r   r   r  r   r  r   rH   r   r   r     s4    J>)1B76.'L2rH   r   c                       e Zd Zd Zd Zy)r  c                 6    t        |t              sJ || _        y r   )r=   r   r|   r   s     r   rU   zContentAttributeParser.__init__  s    $&&&	rH   c                    	 | j                   j                  d       | j                   xj                  dz  c_        | j                   j                          | j                   j                  dk(  sy | j                   xj                  dz  c_        | j                   j                          | j                   j                  dv r| j                   j                  }| j                   xj                  dz  c_        | j                   j                  }| j                   j                  |      r#| j                   || j                   j                   S y | j                   j                  }	 | j                   j                  t               | j                   || j                   j                   S # t        $ r | j                   |d  cY S w xY w# t        $ r Y y w xY w)Nr  r   r  )r  r  )r|   r   ro   r   r   r   r   r   )rT   r   old_positions      r   r  zContentAttributeParser.parse  s`   	IIj)II!#IINN99))T1II!#IINNyy%%5		..		""a'"#yy1199$$U+99\$))2D2DEE  $yy114II(()?@99\$))2D2DEE$ 499\]334  		s>   A-G 0CG G "AF$ $F?<G >F??G 	GGN)r   r   r   rU   r  r   rH   r   r  r    s    rH   r  c                     t        | t              r	 | j                  d      } | 	 t	        j
                  |       S y# t        $ r Y yw xY w# t        $ r Y yw xY w)zReturn the Python codec name corresponding to an encoding.

    Return None if the string doesn't correspond to a valid encoding.

    asciiN)r=   r   decodeUnicodeDecodeErrorwebencodingslookupAttributeError)rO   s    r   rN   rN     sk     (E"	w/H 	&&x00  " 		  		s    < A 	AA	AA)!r   r   ior   r   pathlibr   r   r   r   r)  	constantsr	   r
   r   r  r   r  r  r  r   rx   non_bmp_invalid_codepointsascii_punctuation_rer   rG   rA   rD   r   r   r   r  rN   r   rH   r   <module>r1     s     	    1  : : #"N=M"NN H-HH !!LO!LL .D$<1HH RZZ=> N  "rzzOQ   7t= t=nM2 M`VE Vrx2 x2v# #LrH   