root/eridanus/iriparse.py

Revision 189, 5.1 kB (checked in by Jonathan Jacobs <korpse@…>, 17 months ago)

Temporary workaround for newly modified PyMeta behaviour.
Ignore-this: ffde48f4620f549c77c397b2835260e3

Line 
1import string
2
3from pymeta.grammar import OMeta
4from pymeta.runtime import ParseError
5
6from nevow import url
7
8# http://www.ietf.org/rfc/rfc3987.txt
9
def isInRange(c, ranges):
    """
    Tell whether the code point of a character falls inside any range.

    c is a single character; its ordinal is taken with ord().
    ranges is an iterable of (start, end) pairs of integers, where both
    bounds are inclusive.

    Returns True if at least one range contains ord(c), otherwise False
    (including when ranges is empty).
    """
    o = ord(c)
    return any(start <= o <= end for start, end in ranges)
16
17
def checkIPv6(xs):
    """
    Check that at most one entry of xs is None.

    xs is the list of values collected by the IPv6address grammar rule, where
    each entry is the result of an optional h16 match.  Presumably a None
    entry stands for an omitted group (the "::" shorthand), which may only
    appear once -- confirm against the grammar.

    Returns True when xs contains zero or one None entries, otherwise False.
    """
    return sum(1 for x in xs if x is None) <= 1
20
21
# OMeta (PyMeta) grammar for locating IRIs, modelled on the ABNF of RFC 3987
# (see the link near the top of this module).  The text is compiled by
# OMeta.makeGrammar() together with this module's globals(), so the embedded
# Python expressions can refer to module-level names such as string,
# isInRange and checkIPv6, and to attributes of the parser through self.
#
# The IRI and absolute_IRI rules call self.markURLStart() before matching and
# self.markURLEnd() afterwards, then return the slice of input between the
# two recorded positions.
#
# NOTE(review): dec_octet binds a single input item and compares it with the
# integers 0 and 255; it does not look like it can match a run of decimal
# digits -- confirm whether IPv4address is ever expected to succeed.
# NOTE(review): IPv6address requires a trailing h16 and allows at most one
# empty group, so forms beginning with "::" appear to be rejected -- confirm.
# NOTE(review): scheme requires at least two characters ("+" rather than "*"
# after the first letter) and ihier_part only accepts the "//" authority
# form; both are narrower than the RFC ABNF -- confirm this is intentional.
iriGrammar = """
hexdigit       ::= :x ?(x in string.hexdigits) => x

IRI            ::= !(self.markURLStart()) <scheme> ':' <ihier_part> ('?' <iquery>)? ('#' <ifragment>)? !(self.markURLEnd()) => ''.join(self.input.data[self.urlStart:self.urlEnd])

ihier_part     ::= '/' '/' <iauthority> (<ipath_abempty> | <ipath_absolute> | <ipath_rootless>)?

IRI_reference  ::= <IRI> | <irelative_ref>

absolute_IRI   ::= !(self.markURLStart()) <scheme> ':' <ihier_part> ('?' <iquery>)? !(self.markURLEnd()) => ''.join(self.input.data[self.urlStart:self.urlEnd])

irelative_ref  ::= <irelative_part> ('?' <iquery>)? ('#' <ifragment>)?
irelative_part ::= '/' '/' <iauthority> (<ipath_abempty> | <ipath_absolute> | <ipath_noscheme>)

iauthority     ::= (<iuserinfo> '@')? <ihost> (':' <port>)?

iuserinfo      ::= (<iunreserved> | <pct_encoded> | <sub_delims> | ':')*

ihost          ::= <IP_literal> | <IPv4address> | <ireg_name>

ireg_name      ::= (<iunreserved> | <pct_encoded> | <sub_delims>)+

ipath          ::= <ipath_abempty> | <ipath_absolute> | <ipath_noscheme> | <ipath_rootless>

ipath_abempty  ::= ('/' <isegment>)*
ipath_absolute ::= '/' (<isegment_nz> <ipath_abempty>)?
ipath_noscheme ::= <isegment_nz_nc> <ipath_abempty>
ipath_rootless ::= <isegment_nz> <ipath_abempty>

isegment       ::= <ipchar>*
isegment_nz    ::= <ipchar>+
isegment_nz_nc ::= (<iunreserved> | <pct_encoded> | <sub_delims> | '@')+

ipchar         ::= <iunreserved> | <pct_encoded> | <sub_delims> | ':' | '@'

iquery         ::= (<ipchar> | <iprivate> | '/' | '?')+

ifragment      ::= (<ipchar> | '/' | '?')+

iunreserved    ::= <unreserved> | <ucschar>

ucschar        ::= :x ?(isInRange(x, self.ucscharRanges)) => x

iprivate       ::= :x ?(isInRange(x, self.iprivateRanges)) => x

scheme         ::= <letter> (<letterOrDigit> | '+' | '-' | '.')+

port           ::= <digit>+

IP_literal     ::= '[' (<IPv6address> | <IPvFuture>) ']'

IPvFuture      ::= 'v' <hexdigit>+ '.' (<unreserved> | <sub_delims> | ':')+

IPv6address    ::= (<h16>?:x ':' => x)*:xs ?(checkIPv6(xs)) <h16>

h16            ::= <hexdigit>*:hs ?(1 <= len(hs) <= 4) => ''.join(hs)
ls32           ::= (<h16> ':' <h16>) | <IPv4address>

IPv4address    ::= <dec_octet> '.' <dec_octet> '.' <dec_octet> '.' <dec_octet>

dec_octet      ::= :x ?(0 <= x <= 255) => x

pct_encoded    ::= '%' <hexdigit> <hexdigit>

unreserved     ::= <letterOrDigit> | '-' | '.' | '_' | '~'
reserved       ::= <gen_delims> | <sub_delims>
gen_delims     ::= ':' | '/' | '?' | '#' | '[' | ']' | '@'
sub_delims     ::= '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
"""
91
92
class IRIGrammar(OMeta.makeGrammar(iriGrammar, globals())):
    """
    Parser class built from the iriGrammar text.

    The base class is generated by OMeta.makeGrammar() from iriGrammar and
    this module's globals().  This subclass supplies the code point tables
    and the position-marking hooks that the grammar refers to through self.
    """

    # Inclusive (start, end) code point ranges tested by the grammar's
    # iprivate rule via isInRange().
    iprivateRanges = [(0x0000e000, 0x0000f8ff),
                      (0x000f0000, 0x000ffffd),
                      (0x00100000, 0x0010fffd)]

    # Inclusive (start, end) code point ranges tested by the grammar's
    # ucschar rule via isInRange().
    ucscharRanges  = [(0x000000a0, 0x0000d7ff),
                      (0x0000f900, 0x0000fdcf),
                      (0x0000fdf0, 0x0000ffef),
                      (0x00010000, 0x0001fffd),
                      (0x00020000, 0x0002fffd),
                      (0x00030000, 0x0003fffd),
                      (0x00040000, 0x0004fffd),
                      (0x00050000, 0x0005fffd),
                      (0x00060000, 0x0006fffd),
                      (0x00070000, 0x0007fffd),
                      (0x00080000, 0x0008fffd),
                      (0x00090000, 0x0009fffd),
                      (0x000a0000, 0x000afffd),
                      (0x000b0000, 0x000bfffd),
                      (0x000c0000, 0x000cfffd),
                      (0x000d0000, 0x000dfffd),
                      (0x000e0000, 0x000efffd)]

    def markURLStart(self):
        """
        Remember the current input position as urlStart.

        Called by the IRI and absolute_IRI rules before they start matching.
        """
        self.urlStart = self.input.position

    def markURLEnd(self):
        """
        Remember the current input position as urlEnd.

        Called by the IRI and absolute_IRI rules once they have matched.
        """
        self.urlEnd = self.input.position
121
122
def extractURL(input):
    """
    Apply the IRI grammar rule to the start of input.

    Returns a 2-tuple (uri, end): uri is the result of applying the 'IRI'
    rule, and end is the parser's urlEnd attribute, i.e. the input position
    recorded by markURLEnd() when the rule finished matching.

    Any exception raised by the parser propagates to the caller; callers in
    this module expect ParseError when input does not start with an IRI.
    """
    grammar = IRIGrammar(input)
    uri = grammar.apply('IRI')
    return uri, grammar.urlEnd
127
128
def extractURLsWithPosition(input, supportedSchemes=None):
    """
    Find IRIs embedded in input and yield each one with its end offset.

    input is the text to scan.  supportedSchemes is an iterable of scheme
    prefixes to search for; it defaults to ['http'].

    Yields (uri, pos) pairs, where pos is the offset in input at which the
    match was found plus the end position reported by extractURL(), so it
    points at the end of the IRI, not its start.

    Schemes are processed one after another, each scanning input from the
    beginning, so results are grouped by scheme rather than sorted by
    position.  Each scheme is located with a plain substring search; the
    grammar then decides how much text forms the IRI.
    """
    if supportedSchemes is None:
        supportedSchemes = ['http']

    for scheme in supportedSchemes:
        pos = 0
        while True:
            # find() reports "no more candidates" with -1, which keeps the
            # try block below limited to the parse itself.
            pos = input.find(scheme, pos)
            if pos == -1:
                break

            try:
                uri, inc = extractURL(input[pos:])
            except ParseError:
                # Attempt to skip over the broken IRI.
                pos += 1
                continue

            pos += inc
            yield uri, pos
146
147
def extractURLs(input, supportedSchemes=None):
    """
    Yield every IRI found in input, without position information.

    Takes the same arguments as extractURLsWithPosition() and yields only
    the first element of each pair it produces, in the same order.
    """
    for uri, _ in extractURLsWithPosition(input, supportedSchemes):
        yield uri
151
152
def parseURL(input):
    """
    Parse the IRI at the start of input into a URL object.

    Returns the result of url.URL.fromString() applied to the IRI text that
    extractURL() matched.  Exceptions raised by extractURL() propagate.
    """
    # extractURL() returns (uri, end); only the text is needed here.
    # Unpacking into a single target would raise ValueError on every call.
    uri, _ = extractURL(input)
    # TODO: Actually parse the URL components ourself.
    return url.URL.fromString(uri)
157
158
def parseURLs(input, supportedSchemes=None):
    """
    Yield a URL object for every IRI found in input.

    Takes the same arguments as extractURLs() and passes each IRI it yields
    through url.URL.fromString(), preserving the order.
    """
    for uri in extractURLs(input, supportedSchemes):
        yield url.URL.fromString(uri)
Note: See TracBrowser for help on using the browser.