| 1 | import string |
|---|
| 2 | |
|---|
| 3 | from pymeta.grammar import OMeta |
|---|
| 4 | from pymeta.runtime import ParseError |
|---|
| 5 | |
|---|
| 6 | from nevow import url |
|---|
| 7 | |
|---|
| 8 | # http://www.ietf.org/rfc/rfc3987.txt |
|---|
| 9 | |
|---|
def isInRange(c, ranges):
    """
    Tell whether the code point of C{c} lies inside any of C{ranges}.

    @param c: A single character; it is converted with C{ord}.
    @param ranges: An iterable of C{(start, end)} integer pairs. Both bounds
        are inclusive.
    @return: C{True} if at least one pair satisfies
        C{start <= ord(c) <= end}, otherwise C{False}.
    """
    # Convert once, before looking at any range, so an invalid C{c} is
    # rejected by ord() even when C{ranges} is empty.
    codepoint = ord(c)
    return any(low <= codepoint <= high for low, high in ranges)
| 16 | |
|---|
| 17 | |
|---|
def checkIPv6(xs):
    """
    Check that C{xs} contains at most one C{None} entry.

    The grammar's C{IPv6address} rule calls this on the list of groups it
    collected. A C{None} presumably stands for an omitted group (the C{::}
    shorthand), which may appear only once -- confirm against OMeta's
    optional-match semantics.

    @param xs: An iterable of collected groups.
    @return: C{True} if zero or one items are C{None}, else C{False}.
    """
    missing = sum(1 for group in xs if group is None)
    return missing <= 1
| 20 | |
|---|
| 21 | |
|---|
# OMeta grammar source for IRIs. The rule names follow the ABNF productions
# of RFC 3987 (see the URL at the top of this file). The text is compiled by
# OMeta.makeGrammar() when the IRIGrammar class below is created, with this
# module's globals() available to the embedded Python expressions.
#
# The IRI and absolute_IRI rules call self.markURLStart() and
# self.markURLEnd() around the match and return the matched slice of
# self.input.data joined into a single string.
#
# NOTE(review): dec_octet binds ONE input item to x and compares it with the
# integers 0 and 255. If the input items are characters this does not
# implement RFC 3987's dec-octet (a decimal number of up to three digits) --
# confirm whether IPv4address can ever match, or whether dotted-quad hosts
# only get through via ireg_name.
# NOTE(review): ls32 is defined but not referenced by any other rule here,
# and IPv6address requires a trailing h16 -- this looks like a simplified
# form of the RFC production; confirm that is intended.
iriGrammar = """
hexdigit ::= :x ?(x in string.hexdigits) => x

IRI ::= !(self.markURLStart()) <scheme> ':' <ihier_part> ('?' <iquery>)? ('#' <ifragment>)? !(self.markURLEnd()) => ''.join(self.input.data[self.urlStart:self.urlEnd])

ihier_part ::= '/' '/' <iauthority> (<ipath_abempty> | <ipath_absolute> | <ipath_rootless>)?

IRI_reference ::= <IRI> | <irelative_ref>

absolute_IRI ::= !(self.markURLStart()) <scheme> ':' <ihier_part> ('?' <iquery>)? !(self.markURLEnd()) => ''.join(self.input.data[self.urlStart:self.urlEnd])

irelative_ref ::= <irelative_part> ('?' <iquery>)? ('#' <ifragment>)?
irelative_part ::= '/' '/' <iauthority> (<ipath_abempty> | <ipath_absolute> | <ipath_noscheme>)

iauthority ::= (<iuserinfo> '@')? <ihost> (':' <port>)?

iuserinfo ::= (<iunreserved> | <pct_encoded> | <sub_delims> | ':')*

ihost ::= <IP_literal> | <IPv4address> | <ireg_name>

ireg_name ::= (<iunreserved> | <pct_encoded> | <sub_delims>)+

ipath ::= <ipath_abempty> | <ipath_absolute> | <ipath_noscheme> | <ipath_rootless>

ipath_abempty ::= ('/' <isegment>)*
ipath_absolute ::= '/' (<isegment_nz> <ipath_abempty>)?
ipath_noscheme ::= <isegment_nz_nc> <ipath_abempty>
ipath_rootless ::= <isegment_nz> <ipath_abempty>

isegment ::= <ipchar>*
isegment_nz ::= <ipchar>+
isegment_nz_nc ::= (<iunreserved> | <pct_encoded> | <sub_delims> | '@')+

ipchar ::= <iunreserved> | <pct_encoded> | <sub_delims> | ':' | '@'

iquery ::= (<ipchar> | <iprivate> | '/' | '?')+

ifragment ::= (<ipchar> | '/' | '?')+

iunreserved ::= <unreserved> | <ucschar>

ucschar ::= :x ?(isInRange(x, self.ucscharRanges)) => x

iprivate ::= :x ?(isInRange(x, self.iprivateRanges)) => x

scheme ::= <letter> (<letterOrDigit> | '+' | '-' | '.')+

port ::= <digit>+

IP_literal ::= '[' (<IPv6address> | <IPvFuture>) ']'

IPvFuture ::= 'v' <hexdigit>+ '.' (<unreserved> | <sub_delims> | ':')+

IPv6address ::= (<h16>?:x ':' => x)*:xs ?(checkIPv6(xs)) <h16>

h16 ::= <hexdigit>*:hs ?(1 <= len(hs) <= 4) => ''.join(hs)
ls32 ::= (<h16> ':' <h16>) | <IPv4address>

IPv4address ::= <dec_octet> '.' <dec_octet> '.' <dec_octet> '.' <dec_octet>

dec_octet ::= :x ?(0 <= x <= 255) => x

pct_encoded ::= '%' <hexdigit> <hexdigit>

unreserved ::= <letterOrDigit> | '-' | '.' | '_' | '~'
reserved ::= <gen_delims> | <sub_delims>
gen_delims ::= ':' | '/' | '?' | '#' | '[' | ']' | '@'
sub_delims ::= '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
"""
| 91 | |
|---|
| 92 | |
|---|
class IRIGrammar(OMeta.makeGrammar(iriGrammar, globals())):
    """
    Parser generated from C{iriGrammar}, plus the data and hooks that the
    grammar's embedded Python expressions refer to through C{self}.

    The C{IRI} and C{absolute_IRI} rules call L{markURLStart} and
    L{markURLEnd} and then slice C{self.input.data} between the two
    recorded positions.
    """

    # Inclusive code point ranges accepted by the ``iprivate`` rule
    # (RFC 3987 section 2.2, ``iprivate`` production).
    iprivateRanges = [(0x0000e000, 0x0000f8ff),
                      (0x000f0000, 0x000ffffd),
                      (0x00100000, 0x0010fffd)]

    # Inclusive code point ranges accepted by the ``ucschar`` rule
    # (RFC 3987 section 2.2, ``ucschar`` production).
    ucscharRanges = [(0x000000a0, 0x0000d7ff),
                     (0x0000f900, 0x0000fdcf),
                     (0x0000fdf0, 0x0000ffef),
                     (0x00010000, 0x0001fffd),
                     (0x00020000, 0x0002fffd),
                     (0x00030000, 0x0003fffd),
                     (0x00040000, 0x0004fffd),
                     (0x00050000, 0x0005fffd),
                     (0x00060000, 0x0006fffd),
                     (0x00070000, 0x0007fffd),
                     (0x00080000, 0x0008fffd),
                     (0x00090000, 0x0009fffd),
                     (0x000a0000, 0x000afffd),
                     (0x000b0000, 0x000bfffd),
                     (0x000c0000, 0x000cfffd),
                     (0x000d0000, 0x000dfffd),
                     # The RFC starts the last range at E1000, not E0000:
                     # U+E0000..U+E0FFF is deliberately excluded.
                     (0x000e1000, 0x000efffd)]

    def markURLStart(self):
        """
        Record the current input position as C{self.urlStart}.
        """
        self.urlStart = self.input.position

    def markURLEnd(self):
        """
        Record the current input position as C{self.urlEnd}.
        """
        self.urlEnd = self.input.position
| 121 | |
|---|
| 122 | |
|---|
def extractURL(input):
    """
    Apply the grammar's C{IRI} rule to C{input}.

    @param input: The text handed to L{IRIGrammar}.
    @return: A C{(uri, end)} pair: the result of applying the C{IRI} rule,
        and the C{urlEnd} position the parser recorded while matching.
    @raise: Whatever C{IRIGrammar.apply} raises on a failed match is
        propagated unchanged; callers in this module catch C{ParseError}.
    """
    parser = IRIGrammar(input)
    # apply() must run first: it is what sets parser.urlEnd.
    matched = parser.apply('IRI')
    end = parser.urlEnd
    return matched, end
| 127 | |
|---|
| 128 | |
|---|
def extractURLsWithPosition(input, supportedSchemes=None):
    """
    Generate every IRI found in C{input} together with its end offset.

    Each scheme is handled in turn: all occurrences of the first scheme are
    reported before the next scheme is searched for. For every occurrence
    of the scheme text, L{extractURL} is tried on the input starting there.

    @param input: The text to scan.
    @param supportedSchemes: An iterable of scheme strings to search for.
        C{None} (the default) means C{['http']}.
    @return: A generator of C{(uri, pos)} pairs, where C{pos} is the offset
        of the scheme occurrence plus the end position reported by
        L{extractURL}.
    """
    schemes = ['http'] if supportedSchemes is None else supportedSchemes

    for scheme in schemes:
        offset = 0
        while True:
            try:
                offset = input.index(scheme, offset)
                uri, consumed = extractURL(input[offset:])
                # Resume the search where this match ended.
                offset += consumed
                yield uri, offset
            except ParseError:
                # Not a valid IRI at this occurrence: step one character
                # forward so the same occurrence is not found again.
                offset += 1
            except ValueError:
                # index() reports "no further occurrence" with ValueError.
                break
| 146 | |
|---|
| 147 | |
|---|
def extractURLs(input, supportedSchemes=None):
    """
    Generate every IRI found in C{input}, without position information.

    @param input: The text to scan.
    @param supportedSchemes: Passed through unchanged to
        L{extractURLsWithPosition}.
    @return: A generator of the C{uri} halves of the pairs produced by
        L{extractURLsWithPosition}.
    """
    for match in extractURLsWithPosition(input, supportedSchemes):
        # match is (uri, pos); only the uri is of interest here.
        yield match[0]
| 151 | |
|---|
| 152 | |
|---|
def parseURL(input):
    """
    Apply the C{IRI} rule to C{input} and build a nevow URL from the match.

    @param input: The text handed to L{extractURL}.
    @return: The result of C{url.URL.fromString} on the matched IRI text.
    @raise: Whatever L{extractURL} raises on a failed match is propagated.
    """
    # extractURL returns a (uri, end) pair. The end position is not needed
    # here, but both elements must be unpacked: unpacking into a single
    # target raised ValueError on every call.
    uri, _ = extractURL(input)
    # TODO: Actually parse the URL components ourself.
    return url.URL.fromString(uri)
| 157 | |
|---|
| 158 | |
|---|
def parseURLs(input, supportedSchemes=None):
    """
    Generate a nevow URL object for every IRI found in C{input}.

    @param input: The text to scan.
    @param supportedSchemes: Passed through unchanged to L{extractURLs}.
    @return: A generator yielding C{url.URL.fromString(uri)} for each C{uri}
        produced by L{extractURLs}, in the same order.
    """
    for text in extractURLs(input, supportedSchemes):
        parsed = url.URL.fromString(text)
        yield parsed