View Javadoc
1   /*******************************************************************************
2    * Copyright (c) 2015 ElasticSearch and MITRE, and others
3    * All rights reserved. This program and the accompanying materials
4    * are made available under the terms of the Apache License, Version 2.0 which
5    * accompanies this distribution and is available at
6    *    http://www.apache.org/licenses/LICENSE-2.0.txt
7    ******************************************************************************/
8   
9   // A derivative of commit 14bc4dee08355048d6a94e33834b919a3999a06e
10  //  at https://github.com/chrismale/elasticsearch
11  
12  package org.locationtech.spatial4j.io;
13  
14  
15  import org.locationtech.spatial4j.context.SpatialContext;
16  import org.locationtech.spatial4j.context.SpatialContextFactory;
17  import org.locationtech.spatial4j.exception.InvalidShapeException;
18  import org.locationtech.spatial4j.shape.Shape;
19  import org.locationtech.spatial4j.shape.ShapeFactory;
20  
21  import java.io.IOException;
22  import java.io.Reader;
23  import java.text.ParseException;
24  
25  /**
26   * An extensible parser for <a href="http://en.wikipedia.org/wiki/Well-known_text"> Well Known Text
27   * (WKT)</a>. The shapes supported by this class are:
28   * <ul>
29   * <li>POINT</li>
30   * <li>MULTIPOINT</li>
31   * <li>ENVELOPE (strictly isn't WKT but is defined by OGC's <a
32   * href="http://docs.geoserver.org/stable/en/user/tutorials/cql/cql_tutorial.html">Common Query
33   * Language (CQL)</a>)</li>
34   * <li>LINESTRING</li>
35   * <li>MULTILINESTRING</li>
36   * <LI>POLYGON</LI>
37   * <LI>MULTIPOLYGON</LI>
38   * <li>GEOMETRYCOLLECTION</li>
39   * <li>BUFFER (non-standard Spatial4j operation)</li>
40   * </ul>
41   * 'EMPTY' is supported. Specifying 'Z', 'M', or any other dimensionality in the WKT is effectively
42   * ignored. Thus, you can specify any number of numbers in the coordinate points but only the first
43   * two take effect. The javadocs for the <code>parse___Shape</code> methods further describe these
44   * shapes, or you
45   *
46   * <p>
47   * Most users of this class will call just one method: {@link #parse(String)}, or
48   * {@link #parseIfSupported(String)} to not fail if it isn't parse-able.
49   *
50   * <p>
51   * To support more shapes, extend this class and override
52   * {@link #parseShapeByType(WKTReader.State, String)}. It's also possible to delegate to a WKTParser
53   * by also delegating {@link #newState(String)}.
54   *
55   * <p>
56   * Note, instances of this base class are threadsafe.
57   */
58  public class WKTReader implements ShapeReader {
59    protected final SpatialContext ctx;
60    protected final ShapeFactory shapeFactory;
61  
62    // TODO support SRID: "SRID=4326;POINT(1,2)
63  
64    /**
65     * This constructor is required by
66     * {@link org.locationtech.spatial4j.context.SpatialContextFactory#makeFormats(SpatialContext)}.
67     */
68    public WKTReader(SpatialContext ctx, SpatialContextFactory factory) {
69      this.ctx = ctx;
70      this.shapeFactory = ctx.getShapeFactory();
71    }
72  
73  
74    /**
75     * Parses the wktString, returning the defined Shape.
76     *
77     * @return Non-null Shape defined in the String
78     * @throws ParseException Thrown if there is an error in the Shape definition
79     */
80    public Shape parse(String wktString) throws ParseException, InvalidShapeException {
81      Shape shape = parseIfSupported(wktString);// sets rawString & offset
82      if (shape != null)
83        return shape;
84      String shortenedString =
85          (wktString.length() <= 128 ? wktString : wktString.substring(0, 128 - 3) + "...");
86      throw new ParseException("Unknown Shape definition [" + shortenedString + "]", 0);
87    }
88  
89    /**
90     * Parses the wktString, returning the defined Shape. If it can't because the shape name is
91     * unknown or an empty or blank string was passed, then it returns null. If the WKT starts with a
92     * supported shape but contains an inner unsupported shape then it will result in a
93     * {@link ParseException}.
94     *
95     * @param wktString non-null, can be empty or have surrounding whitespace
96     * @return Shape, null if unknown / unsupported shape.
97     * @throws ParseException Thrown if there is an error in the Shape definition
98     */
99    public Shape parseIfSupported(String wktString) throws ParseException, InvalidShapeException {
100     State state = newState(wktString);
101     state.nextIfWhitespace();// leading
102     if (state.eof())
103       return null;
104     // shape types must start with a letter
105     if (!Character.isLetter(state.rawString.charAt(state.offset)))
106       return null;
107     String shapeType = state.nextWord();
108     Shape result = null;
109     try {
110       result = parseShapeByType(state, shapeType);
111     } catch (ParseException | InvalidShapeException e) {
112       throw e;
113     } catch (IllegalArgumentException e) { // NOTE: JTS Throws IAE for bad WKT
114       throw new InvalidShapeException(e.getMessage(), e);
115     } catch (Exception e) {
116       ParseException pe = new ParseException(e.toString(), state.offset);
117       pe.initCause(e);
118       throw pe;
119     }
120     if (result != null && !state.eof())
121       throw new ParseException("end of shape expected", state.offset);
122     return result;
123   }
124 
125   /**
126    * (internal) Creates a new State with the given String. It's only called by
127    * {@link #parseIfSupported(String)}. This is an extension point for subclassing.
128    */
129   protected State newState(String wktString) {
130     // NOTE: if we wanted to re-use old States to reduce object allocation, we might do that
131     // here. But in the scheme of things, it doesn't seem worth the bother as it complicates the
132     // thread-safety story of the API for too little of a gain.
133     return new State(wktString);
134   }
135 
136   /**
137    * (internal) Parses the remainder of a shape definition following the shape's name given as
138    * {@code shapeType} already consumed via {@link State#nextWord()}. If it's able to parse the
139    * shape, {@link WKTReader.State#offset} should be advanced beyond it (e.g. to the ',' or ')' or
140    * EOF in general). The default implementation checks the name against some predefined names and
141    * calls corresponding parse methods to handle the rest. Overriding this method is an excellent
142    * extension point for additional shape types. Or, use this class by delegation to this method.
143    * <p>
144    * When writing a parse method that reacts to a specific shape type, remember to handle the
145    * dimension and EMPTY token via
146    * {@link org.locationtech.spatial4j.io.WKTReader.State#nextIfEmptyAndSkipZM()}.
147    *
148    * @param shapeType Non-Null string; could have mixed case. The first character is a letter.
149    * @return The shape or null if not supported / unknown.
150    */
151   protected Shape parseShapeByType(State state, String shapeType) throws ParseException {
152     assert Character.isLetter(shapeType.charAt(0)) : "Shape must start with letter: " + shapeType;
153 
154     if (shapeType.equalsIgnoreCase("POINT")) {
155       return parsePointShape(state);
156     } else if (shapeType.equalsIgnoreCase("MULTIPOINT")) {
157       return parseMultiPointShape(state);
158     } else if (shapeType.equalsIgnoreCase("ENVELOPE")) {
159       return parseEnvelopeShape(state);
160     } else if (shapeType.equalsIgnoreCase("LINESTRING")) {
161       return parseLineStringShape(state);
162     } else if (shapeType.equalsIgnoreCase("POLYGON")) {
163       return parsePolygonShape(state);
164     } else if (shapeType.equalsIgnoreCase("GEOMETRYCOLLECTION")) {
165       return parseGeometryCollectionShape(state);
166     } else if (shapeType.equalsIgnoreCase("MULTILINESTRING")) {
167       return parseMultiLineStringShape(state);
168     } else if (shapeType.equalsIgnoreCase("MULTIPOLYGON")) {
169       return parseMulitPolygonShape(state);
170     }
171     // extension
172     if (shapeType.equalsIgnoreCase("BUFFER")) {
173       return parseBufferShape(state);
174     }
175 
176     // HEY! Update class Javadocs if add more shapes
177     return null;
178   }
179 
180   /**
181    * Parses the BUFFER operation applied to a parsed shape.
182    * 
183    * <pre>
184    *   '(' shape ',' number ')'
185    * </pre>
186    * 
187    * Whereas 'number' is the distance to buffer the shape by.
188    */
189   protected Shape parseBufferShape(State state) throws ParseException {
190     state.nextExpect('(');
191     Shape shape = shape(state);
192     state.nextExpect(',');
193     double distance = shapeFactory.normDist(state.nextDouble());
194     state.nextExpect(')');
195     return shape.getBuffered(distance, ctx);
196   }
197 
198   /**
199    * Parses a POINT shape from the raw string.
200    * 
201    * <pre>
202    *   '(' coordinate ')'
203    * </pre>
204    *
205    * @see #point(State, ShapeFactory.PointsBuilder)
206    */
207   protected Shape parsePointShape(State state) throws ParseException {
208     if (state.nextIfEmptyAndSkipZM())
209       return shapeFactory.pointXY(Double.NaN, Double.NaN);
210     state.nextExpect('(');
211     OnePointsBuilder onePointsBuilder = new OnePointsBuilder(shapeFactory);
212     point(state, onePointsBuilder);
213     state.nextExpect(')');
214     return onePointsBuilder.getPoint();
215   }
216 
217   /**
218    * Parses a MULTIPOINT shape from the raw string -- a collection of points.
219    * 
220    * <pre>
221    *   '(' coordinate (',' coordinate )* ')'
222    * </pre>
223    * 
224    * Furthermore, coordinate can optionally be wrapped in parenthesis.
225    *
226    * @see #point(State, ShapeFactory.PointsBuilder)
227    */
228   protected Shape parseMultiPointShape(State state) throws ParseException {
229     ShapeFactory.MultiPointBuilder builder = shapeFactory.multiPoint();
230     if (state.nextIfEmptyAndSkipZM())
231       return builder.build();
232     state.nextExpect('(');
233     do {
234       boolean openParen = state.nextIf('(');
235       point(state, builder);
236       if (openParen)
237         state.nextExpect(')');
238     } while (state.nextIf(','));
239     state.nextExpect(')');
240     return builder.build();
241   }
242 
243   /**
244    * Parses an ENVELOPE (aka Rectangle) shape from the raw string. The values are normalized.
245    * <p>
246    * Source: OGC "Catalogue Services Specification", the "CQL" (Common Query Language) sub-spec.
247    * <em>Note the inconsistent order of the min &amp; max values between x &amp; y!</em>
248    * 
249    * <pre>
250    *   '(' x1 ',' x2 ',' y2 ',' y1 ')'
251    * </pre>
252    */
253   protected Shape parseEnvelopeShape(State state) throws ParseException {
254     // FYI no dimension or EMPTY
255     state.nextExpect('(');
256     double x1 = state.nextDouble();
257     state.nextExpect(',');
258     double x2 = state.nextDouble();
259     state.nextExpect(',');
260     double y2 = state.nextDouble();
261     state.nextExpect(',');
262     double y1 = state.nextDouble();
263     state.nextExpect(')');
264     return shapeFactory.rect(shapeFactory.normX(x1), shapeFactory.normX(x2),
265             shapeFactory.normY(y1), shapeFactory.normY(y2));
266   }
267 
268   /**
269    * Parses a LINESTRING shape from the raw string -- an ordered sequence of points.
270    * 
271    * <pre>
272    * coordinateSequence
273    * </pre>
274    *
275    * @see #pointList(State, ShapeFactory.PointsBuilder)
276    */
277   protected Shape parseLineStringShape(State state) throws ParseException {
278     ShapeFactory.LineStringBuilder lineStringBuilder = shapeFactory.lineString();
279     if (state.nextIfEmptyAndSkipZM())
280       return lineStringBuilder.build();
281     return pointList(state, lineStringBuilder).build();
282   }
283 
284   /**
285    * Parses a MULTILINESTRING shape from the raw string -- a collection of line strings.
286    * 
287    * <pre>
288    *   '(' coordinateSequence (',' coordinateSequence )* ')'
289    * </pre>
290    *
291    * @see #parseLineStringShape(org.locationtech.spatial4j.io.WKTReader.State)
292    */
293   protected Shape parseMultiLineStringShape(State state) throws ParseException {
294     ShapeFactory.MultiLineStringBuilder multiLineStringBuilder = shapeFactory.multiLineString();
295     if (!state.nextIfEmptyAndSkipZM()) {
296       state.nextExpect('(');
297       do {
298         multiLineStringBuilder.add(pointList(state, multiLineStringBuilder.lineString()));
299       } while (state.nextIf(','));
300       state.nextExpect(')');
301     }
302     return multiLineStringBuilder.build();
303   }
304 
305   /**
306    * Parses a POLYGON shape from the raw string. It might return a
307    * {@link org.locationtech.spatial4j.shape.Rectangle} if the polygon is one.
308    *
309    * <pre>
310    * coordinateSequenceList
311    * </pre>
312    */
313   protected Shape parsePolygonShape(WKTReader.State state) throws ParseException {
314     ShapeFactory.PolygonBuilder polygonBuilder = shapeFactory.polygon();
315     if (!state.nextIfEmptyAndSkipZM()) {
316       polygonBuilder = polygon(state, polygonBuilder);
317     }
318     return polygonBuilder.buildOrRect();
319   }
320 
321   /**
322    * Parses a MULTIPOLYGON shape from the raw string.
323    *
324    * <pre>
325    *   '(' polygon (',' polygon )* ')'
326    * </pre>
327    */
328   protected Shape parseMulitPolygonShape(WKTReader.State state) throws ParseException {
329     ShapeFactory.MultiPolygonBuilder multiPolygonBuilder = shapeFactory.multiPolygon();
330     if (!state.nextIfEmptyAndSkipZM()) {
331       state.nextExpect('(');
332       do {
333         multiPolygonBuilder.add(polygon(state, multiPolygonBuilder.polygon()));
334       } while (state.nextIf(','));
335       state.nextExpect(')');
336     }
337     return multiPolygonBuilder.build();
338   }
339 
340   /**
341    * Parses a GEOMETRYCOLLECTION shape from the raw string.
342    * 
343    * <pre>
344    *   '(' shape (',' shape )* ')'
345    * </pre>
346    */
347   protected Shape parseGeometryCollectionShape(State state) throws ParseException {
348     ShapeFactory.MultiShapeBuilder<Shape> multiShapeBuilder = shapeFactory.multiShape(Shape.class);
349     if (state.nextIfEmptyAndSkipZM())
350       return multiShapeBuilder.build();
351     state.nextExpect('(');
352     do {
353       multiShapeBuilder.add(shape(state));
354     } while (state.nextIf(','));
355     state.nextExpect(')');
356     return multiShapeBuilder.build();
357   }
358 
359   /**
360    * Reads a shape from the current position, starting with the name of the shape. It calls
361    * {@link #parseShapeByType(org.locationtech.spatial4j.io.WKTReader.State, String)} and throws an
362    * exception if the shape wasn't supported.
363    */
364   protected Shape shape(State state) throws ParseException {
365     String type = state.nextWord();
366     Shape shape = parseShapeByType(state, type);
367     if (shape == null)
368       throw new ParseException("Shape of type " + type + " is unknown", state.offset);
369     return shape;
370   }
371 
372   /**
373    * Reads a list of Points (AKA CoordinateSequence) from the current position.
374    * 
375    * <pre>
376    *   '(' coordinate (',' coordinate )* ')'
377    * </pre>
378    *
379    * @see #point(State, ShapeFactory.PointsBuilder)
380    */
381   protected <B extends ShapeFactory.PointsBuilder> B pointList(State state, B pointsBuilder) throws ParseException {
382     state.nextExpect('(');
383     do {
384       point(state, pointsBuilder);
385     } while (state.nextIf(','));
386     state.nextExpect(')');
387     return pointsBuilder;
388   }
389 
390   /**
391    * Reads a raw Point (AKA Coordinate) from the current position. Only the first 2 numbers are
392    * used. The values are normalized.
393    * 
394    * <pre>
395    *   number number number*
396    * </pre>
397    */
398   protected ShapeFactory.PointsBuilder point(State state, ShapeFactory.PointsBuilder pointsBuilder) throws ParseException {
399     double x = state.nextDouble();
400     double y = state.nextDouble();
401     state.skipNextDoubles();//TODO capture to call pointXYZ
402     pointsBuilder.pointXY(shapeFactory.normX(x), shapeFactory.normY(y));
403     return pointsBuilder;
404   }
405 
406   /**
407    * Reads a polygon
408    */
409   protected ShapeFactory.PolygonBuilder polygon(WKTReader.State state, ShapeFactory.PolygonBuilder polygonBuilder) throws ParseException {
410     state.nextExpect('(');
411     pointList(state, polygonBuilder); // outer ring
412     while (state.nextIf(',')) {
413       ShapeFactory.PolygonBuilder.HoleBuilder holeBuilder = polygonBuilder.hole();
414       pointList(state, holeBuilder);
415       holeBuilder.endHole();
416     }
417     state.nextExpect(')');
418     return polygonBuilder;
419   }
420 
421   /** The parse state. */
422   public class State {
423     /** Set in {@link #parseIfSupported(String)}. */
424     public String rawString;
425     /** Offset of the next char in {@link #rawString} to be read. */
426     public int offset;
427     /** Dimensionality specifier (e.g. 'Z', or 'M') following a shape type name. */
428     public String dimension;
429 
430     public State(String rawString) {
431       this.rawString = rawString;
432     }
433 
434     public SpatialContext getCtx() {
435       return ctx;
436     }
437 
438     public WKTReader getParser() {
439       return WKTReader.this;
440     }
441 
442     /**
443      * Reads the word starting at the current character position. The word terminates once
444      * {@link Character#isJavaIdentifierPart(char)} returns false (or EOF). {@link #offset} is
445      * advanced past whitespace.
446      *
447      * @return Non-null non-empty String.
448      */
449     public String nextWord() throws ParseException {
450       int startOffset = offset;
451       while (offset < rawString.length()
452           && Character.isJavaIdentifierPart(rawString.charAt(offset))) {
453         offset++;
454       }
455       if (startOffset == offset)
456         throw new ParseException("Word expected", startOffset);
457       String result = rawString.substring(startOffset, offset);
458       nextIfWhitespace();
459       return result;
460     }
461 
462     /**
463      * Skips over a dimensionality token (e.g. 'Z' or 'M') if found, storing in {@link #dimension},
464      * and then looks for EMPTY, consuming that and whitespace.
465      * 
466      * <pre>
467      *   dimensionToken? 'EMPTY'?
468      * </pre>
469      * 
470      * @return True if EMPTY was found.
471      */
472     public boolean nextIfEmptyAndSkipZM() throws ParseException {
473       if (eof())
474         return false;
475       char c = rawString.charAt(offset);
476       if (c == '(' || !Character.isJavaIdentifierPart(c))
477         return false;
478       String word = nextWord();
479       if (word.equalsIgnoreCase("EMPTY"))
480         return true;
481       // we figure this word is Z or ZM or some other dimensionality signifier. We skip it.
482       this.dimension = word;
483 
484       if (eof())
485         return false;
486       c = rawString.charAt(offset);
487       if (c == '(' || !Character.isJavaIdentifierPart(c))
488         return false;
489       word = nextWord();
490       if (word.equalsIgnoreCase("EMPTY"))
491         return true;
492       throw new ParseException("Expected EMPTY because found dimension; but got [" + word + "]",
493           offset);
494     }
495 
496     /**
497      * Reads in a double from the String. Parses digits with an optional decimal, sign, or exponent.
498      * NaN and Infinity are not supported. {@link #offset} is advanced past whitespace.
499      *
500      * @return Double value
501      */
502     public double nextDouble() throws ParseException {
503       int startOffset = offset;
504       skipDouble();
505       if (startOffset == offset)
506         throw new ParseException("Expected a number", offset);
507       double result;
508       try {
509         result = Double.parseDouble(rawString.substring(startOffset, offset));
510       } catch (Exception e) {
511         throw new ParseException(e.toString(), offset);
512       }
513       nextIfWhitespace();
514       return result;
515     }
516 
517     /** Advances offset forward until it points to a character that isn't part of a number. */
518     public void skipDouble() {
519       int startOffset = offset;
520       for (; offset < rawString.length(); offset++) {
521         char c = rawString.charAt(offset);
522         if (!(Character.isDigit(c) || c == '.' || c == '-' || c == '+')) {
523           // 'e' is okay as long as it isn't first
524           if (offset != startOffset && (c == 'e' || c == 'E'))
525             continue;
526           break;
527         }
528       }
529     }
530 
531     /** Advances past as many doubles as there are, with intervening whitespace. */
532     public void skipNextDoubles() {
533       while (!eof()) {
534         int startOffset = offset;
535         skipDouble();
536         if (startOffset == offset)
537           return;
538         nextIfWhitespace();
539       }
540     }
541 
542     /**
543      * Verifies that the current character is of the expected value. If the character is the
544      * expected value, then it is consumed and {@link #offset} is advanced past whitespace.
545      *
546      * @param expected The expected char.
547      */
548     public void nextExpect(char expected) throws ParseException {
549       if (eof())
550         throw new ParseException("Expected [" + expected + "] found EOF", offset);
551       char c = rawString.charAt(offset);
552       if (c != expected)
553         throw new ParseException("Expected [" + expected + "] found [" + c + "]", offset);
554       offset++;
555       nextIfWhitespace();
556     }
557 
558     /** If the string is consumed, i.e. at end-of-file. */
559     public final boolean eof() {
560       return offset >= rawString.length();
561     }
562 
563     /**
564      * If the current character is {@code expected}, then offset is advanced after it and any
565      * subsequent whitespace. Otherwise, false is returned.
566      *
567      * @param expected The expected char
568      * @return true if consumed
569      */
570     public boolean nextIf(char expected) {
571       if (!eof() && rawString.charAt(offset) == expected) {
572         offset++;
573         nextIfWhitespace();
574         return true;
575       }
576       return false;
577     }
578 
579     /**
580      * Moves offset to next non-whitespace character. Doesn't move if the offset is already at
581      * non-whitespace. <em>There is very little reason for subclasses to call this because
582      * most other parsing methods call it.</em>
583      */
584     public void nextIfWhitespace() {
585       for (; offset < rawString.length(); offset++) {
586         if (!Character.isWhitespace(rawString.charAt(offset))) {
587           return;
588         }
589       }
590     }
591 
592     /**
593      * Returns the next chunk of text till the next ',' or ')' (non-inclusive) or EOF. If a '(' is
594      * encountered, then it looks past its matching ')', taking care to handle nested matching
595      * parenthesis too. It's designed to be of use to subclasses that wish to get the entire
596      * subshape at the current position as a string so that it might be passed to other software
597      * that will parse it.
598      * <p>
599      * Example:
600      * 
601      * <pre>
602      * OUTER(INNER(3, 5))
603      * </pre>
604      * 
605      * If this is called when offset is at the first character, then it will return this whole
606      * string. If called at the "I" then it will return "INNER(3, 5)". If called at "3", then it
607      * will return "3". In all cases, offset will be positioned at the next position following the
608      * returned substring.
609      *
610      * @return non-null substring.
611      */
612     public String nextSubShapeString() throws ParseException {
613       int startOffset = offset;
614       int parenStack = 0;// how many parenthesis levels are we in?
615       for (; offset < rawString.length(); offset++) {
616         char c = rawString.charAt(offset);
617         if (c == ',') {
618           if (parenStack == 0)
619             break;
620         } else if (c == ')') {
621           if (parenStack == 0)
622             break;
623           parenStack--;
624         } else if (c == '(') {
625           parenStack++;
626         }
627       }
628       if (parenStack != 0)
629         throw new ParseException("Unbalanced parenthesis", startOffset);
630       return rawString.substring(startOffset, offset);
631     }
632 
633   }// class State
634 
635   @Override
636   public String getFormatName() {
637     return ShapeIO.WKT;
638   }
639   
640   static String readString(Reader reader) throws IOException {
641     char[] arr = new char[1024];
642     StringBuilder buffer = new StringBuilder();
643     int numCharsRead;
644     while ((numCharsRead = reader.read(arr, 0, arr.length)) != -1) {
645       buffer.append(arr, 0, numCharsRead);
646     }
647     return buffer.toString();
648   }
649 
650   @Override
651   public Shape read(Reader reader) throws IOException, ParseException {
652     return parse(readString(reader));
653   }
654 
655   @Override
656   public Shape read(Object value) throws IOException, ParseException, InvalidShapeException {
657     return parse(value.toString());
658   }
659 
660   @Override
661   public Shape readIfSupported(Object value) throws InvalidShapeException {
662     try {
663       return parseIfSupported(value.toString());
664     } catch (ParseException e) {
665     }
666     return null;
667   }
668 }