Reference (Gold): geopandas
Pytest Summary for tests
| status | count |
| --- | --- |
| passed | 1517 |
| xfailed | 6 |
| skipped | 65 |
| xpassed | 2 |
| total | 1590 |
| collected | 1590 |
Failed tests:
test_extension_array.py::TestGetitem::test_getitem_series_integer_with_missing_raises[list]
[gw4] linux -- Python 3.10.12 /testbed/.venv/bin/python3
test_extension_array.py::TestGetitem::test_getitem_series_integer_with_missing_raises[integer-array]
[gw4] linux -- Python 3.10.12 /testbed/.venv/bin/python3
test_extension_array.py::TestSetitem::test_setitem_integer_with_missing_raises[list-True]
[gw4] linux -- Python 3.10.12 /testbed/.venv/bin/python3
test_op_output_types.py::test_loc_add_row[geom]
[gw0] linux -- Python 3.10.12 /testbed/.venv/bin/python3
test_pandas_methods.py::test_drop_duplicates_series
[gw3] linux -- Python 3.10.12 /testbed/.venv/bin/python3
test_pandas_methods.py::test_drop_duplicates_frame
[gw3] linux -- Python 3.10.12 /testbed/.venv/bin/python3
test_sindex.py::TestShapelyInterface::test_query_sorting[False-expected1]
[gw3] linux -- Python 3.10.12 /testbed/.venv/bin/python3
test_sindex.py::TestShapelyInterface::test_query_bulk_sorting[False-expected1]
[gw3] linux -- Python 3.10.12 /testbed/.venv/bin/python3
Patch diff
diff --git a/geopandas/_compat.py b/geopandas/_compat.py
index 2c7e74f0..3d582bdc 100644
--- a/geopandas/_compat.py
+++ b/geopandas/_compat.py
@@ -1,21 +1,35 @@
import importlib
from packaging.version import Version
+
import pandas as pd
+
import shapely
import shapely.geos
-PANDAS_GE_14 = Version(pd.__version__) >= Version('1.4.0rc0')
-PANDAS_GE_15 = Version(pd.__version__) >= Version('1.5.0')
-PANDAS_GE_20 = Version(pd.__version__) >= Version('2.0.0')
-PANDAS_GE_202 = Version(pd.__version__) >= Version('2.0.2')
-PANDAS_GE_21 = Version(pd.__version__) >= Version('2.1.0')
-PANDAS_GE_22 = Version(pd.__version__) >= Version('2.2.0')
-PANDAS_GE_30 = Version(pd.__version__) >= Version('3.0.0.dev0')
-SHAPELY_GE_204 = Version(shapely.__version__) >= Version('2.0.4')
+
+# -----------------------------------------------------------------------------
+# pandas compat
+# -----------------------------------------------------------------------------
+
+PANDAS_GE_14 = Version(pd.__version__) >= Version("1.4.0rc0")
+PANDAS_GE_15 = Version(pd.__version__) >= Version("1.5.0")
+PANDAS_GE_20 = Version(pd.__version__) >= Version("2.0.0")
+PANDAS_GE_202 = Version(pd.__version__) >= Version("2.0.2")
+PANDAS_GE_21 = Version(pd.__version__) >= Version("2.1.0")
+PANDAS_GE_22 = Version(pd.__version__) >= Version("2.2.0")
+PANDAS_GE_30 = Version(pd.__version__) >= Version("3.0.0.dev0")
+
+
+# -----------------------------------------------------------------------------
+# Shapely / GEOS compat
+# -----------------------------------------------------------------------------
+
+SHAPELY_GE_204 = Version(shapely.__version__) >= Version("2.0.4")
+
GEOS_GE_390 = shapely.geos.geos_version >= (3, 9, 0)
GEOS_GE_310 = shapely.geos.geos_version >= (3, 10, 0)
-def import_optional_dependency(name: str, extra: str=''):
+def import_optional_dependency(name: str, extra: str = ""):
"""
Import an optional dependency.
@@ -33,12 +47,46 @@ def import_optional_dependency(name: str, extra: str=''):
-------
module
"""
- pass
+ msg = """Missing optional dependency '{name}'. {extra} "
+ "Use pip or conda to install {name}.""".format(
+ name=name, extra=extra
+ )
+
+ if not isinstance(name, str):
+ raise ValueError(
+ "Invalid module name: '{name}'; must be a string".format(name=name)
+ )
+ try:
+ module = importlib.import_module(name)
+ except ImportError:
+ raise ImportError(msg) from None
+
+ return module
+
+
+# -----------------------------------------------------------------------------
+# pyproj compat
+# -----------------------------------------------------------------------------
try:
- import pyproj
+ import pyproj # noqa: F401
+
HAS_PYPROJ = True
+
except ImportError as err:
HAS_PYPROJ = False
pyproj_import_error = str(err)
+
+
+def requires_pyproj(func):
+ def wrapper(*args, **kwargs):
+ if not HAS_PYPROJ:
+ raise ImportError(
+ f"The 'pyproj' package is required for {func.__name__} to work. "
+ "Install it and initialize the object with a CRS before using it."
+ f"\nImporting pyproj resulted in: {pyproj_import_error}"
+ )
+ return func(*args, **kwargs)
+
+ return wrapper
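
For context (not part of the patch), a minimal sketch of how the `_compat` helpers restored above behave; `folium` is just an example dependency name and `reproject_something` is hypothetical:

```python
from geopandas._compat import import_optional_dependency, requires_pyproj

# Returns the module if importable, otherwise raises ImportError with an
# install hint built from `name` and `extra`.
try:
    folium = import_optional_dependency("folium", extra="Needed for interactive maps.")
except ImportError as err:
    print(err)

# Guard a CRS-dependent function: calling it without pyproj installed
# raises a descriptive ImportError instead of failing later.
@requires_pyproj
def reproject_something(geoms):
    ...
```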
diff --git a/geopandas/_config.py b/geopandas/_config.py
index a27f8854..d92882a7 100644
--- a/geopandas/_config.py
+++ b/geopandas/_config.py
@@ -5,23 +5,28 @@ Based on https://github.com/topper-123/optioneer, but simplified (don't deal
with nested options, deprecated options, ..), just the attribute-style dict
like holding the options and giving a nice repr.
"""
+
import textwrap
import warnings
from collections import namedtuple
-Option = namedtuple('Option', 'key default_value doc validator callback')
+
+Option = namedtuple("Option", "key default_value doc validator callback")
class Options(object):
"""Provide attribute-style access to configuration dict."""
def __init__(self, options):
- super().__setattr__('_options', options)
+ super().__setattr__("_options", options)
+ # populate with default values
config = {}
for key, option in options.items():
config[key] = option.default_value
- super().__setattr__('_config', config)
+
+ super().__setattr__("_config", config)
def __setattr__(self, key, value):
+ # you can't set new keys
if key in self._config:
option = self._options[key]
if option.validator:
@@ -30,45 +35,99 @@ class Options(object):
if option.callback:
option.callback(key, value)
else:
- msg = 'You can only set the value of existing options'
+ msg = "You can only set the value of existing options"
raise AttributeError(msg)
def __getattr__(self, key):
try:
return self._config[key]
except KeyError:
- raise AttributeError('No such option')
+ raise AttributeError("No such option")
def __dir__(self):
return list(self._config.keys())
def __repr__(self):
cls = self.__class__.__name__
- description = ''
+ description = ""
for key, option in self._options.items():
- descr = '{key}: {cur!r} [default: {default!r}]\n'.format(key=
- key, cur=self._config[key], default=option.default_value)
+ descr = "{key}: {cur!r} [default: {default!r}]\n".format(
+ key=key, cur=self._config[key], default=option.default_value
+ )
description += descr
+
if option.doc:
- doc_text = '\n'.join(textwrap.wrap(option.doc, width=70))
+ doc_text = "\n".join(textwrap.wrap(option.doc, width=70))
else:
- doc_text = 'No description available.'
- doc_text = textwrap.indent(doc_text, prefix=' ')
- description += doc_text + '\n'
- space = '\n '
- description = description.replace('\n', space)
- return '{}({}{})'.format(cls, space, description)
-
-
-display_precision = Option(key='display_precision', default_value=None, doc
- =
- 'The precision (maximum number of decimals) of the coordinates in the WKT representation in the Series/DataFrame display. By default (None), it tries to infer and use 3 decimals for projected coordinates and 5 decimals for geographic coordinates.'
- , validator=_validate_display_precision, callback=None)
-io_engine = Option(key='io_engine', default_value=None, doc=
- "The default engine for ``read_file`` and ``to_file``. Options are 'pyogrio' and 'fiona'."
- , validator=_validate_io_engine, callback=None)
-use_pygeos = Option(key='use_pygeos', default_value=False, doc=
- 'Deprecated option previously used to enable PyGEOS. It will be removed in GeoPandas 1.1.'
- , validator=_warn_use_pygeos_deprecated, callback=None)
-options = Options({'display_precision': display_precision, 'use_pygeos':
- use_pygeos, 'io_engine': io_engine})
+ doc_text = "No description available."
+ doc_text = textwrap.indent(doc_text, prefix=" ")
+ description += doc_text + "\n"
+ space = "\n "
+ description = description.replace("\n", space)
+ return "{}({}{})".format(cls, space, description)
+
+
+def _validate_display_precision(value):
+ if value is not None:
+ if not isinstance(value, int) or not (0 <= value <= 16):
+ raise ValueError("Invalid value, needs to be an integer [0-16]")
+
+
+display_precision = Option(
+ key="display_precision",
+ default_value=None,
+ doc=(
+ "The precision (maximum number of decimals) of the coordinates in "
+ "the WKT representation in the Series/DataFrame display. "
+ "By default (None), it tries to infer and use 3 decimals for projected "
+ "coordinates and 5 decimals for geographic coordinates."
+ ),
+ validator=_validate_display_precision,
+ callback=None,
+)
+
+
+def _warn_use_pygeos_deprecated(_value):
+ warnings.warn(
+ "pygeos support was removed in 1.0. "
+ "geopandas.use_pygeos is a no-op and will be removed in geopandas 1.1.",
+ stacklevel=3,
+ )
+
+
+def _validate_io_engine(value):
+ if value is not None:
+ if value not in ("pyogrio", "fiona"):
+ raise ValueError(f"Expected 'pyogrio' or 'fiona', got '{value}'")
+
+
+io_engine = Option(
+ key="io_engine",
+ default_value=None,
+ doc=(
+ "The default engine for ``read_file`` and ``to_file``. "
+ "Options are 'pyogrio' and 'fiona'."
+ ),
+ validator=_validate_io_engine,
+ callback=None,
+)
+
+# TODO: deprecate this
+use_pygeos = Option(
+ key="use_pygeos",
+ default_value=False,
+ doc=(
+ "Deprecated option previously used to enable PyGEOS. "
+ "It will be removed in GeoPandas 1.1."
+ ),
+ validator=_warn_use_pygeos_deprecated,
+ callback=None,
+)
+
+options = Options(
+ {
+ "display_precision": display_precision,
+ "use_pygeos": use_pygeos,
+ "io_engine": io_engine,
+ }
+)
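
The options object built here is exposed as `geopandas.options`; a short sketch (illustrative only) of how the validators added in this patch respond to assignments:

```python
import geopandas

# Valid: display_precision must be None or an int in [0, 16].
geopandas.options.display_precision = 3

# Out-of-range values are rejected by _validate_display_precision.
try:
    geopandas.options.display_precision = 42
except ValueError as err:
    print(err)  # Invalid value, needs to be an integer [0-16]

# io_engine only accepts 'pyogrio' or 'fiona'.
try:
    geopandas.options.io_engine = "gdal"
except ValueError as err:
    print(err)  # Expected 'pyogrio' or 'fiona', got 'gdal'
```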
diff --git a/geopandas/_decorator.py b/geopandas/_decorator.py
index d242f705..dee8e17c 100644
--- a/geopandas/_decorator.py
+++ b/geopandas/_decorator.py
@@ -1,8 +1,11 @@
from textwrap import dedent
from typing import Callable, Union
+# doc decorator function ported with modifications from Pandas
+# https://github.com/pandas-dev/pandas/blob/master/pandas/util/_decorators.py
-def doc(*docstrings: Union[str, Callable], **params) ->Callable:
+
+def doc(*docstrings: Union[str, Callable], **params) -> Callable:
"""
A decorator take docstring templates, concatenate them and perform string
substitution on it.
@@ -20,4 +23,30 @@ def doc(*docstrings: Union[str, Callable], **params) ->Callable:
**params
The string which would be used to format docstring template.
"""
- pass
+
+ def decorator(decorated: Callable) -> Callable:
+ # collecting docstring and docstring templates
+ docstring_components: list[Union[str, Callable]] = []
+ if decorated.__doc__:
+ docstring_components.append(dedent(decorated.__doc__))
+
+ for docstring in docstrings:
+ if hasattr(docstring, "_docstring_components"):
+ docstring_components.extend(docstring._docstring_components)
+ elif isinstance(docstring, str) or docstring.__doc__:
+ docstring_components.append(docstring)
+
+ # formatting templates and concatenating docstring
+ decorated.__doc__ = "".join(
+ (
+ component.format(**params)
+ if isinstance(component, str)
+ else dedent(component.__doc__ or "")
+ )
+ for component in docstring_components
+ )
+
+ decorated._docstring_components = docstring_components
+ return decorated
+
+ return decorator
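
A small, hypothetical usage of the restored `doc` decorator (the functions and template below are illustrative, not from the patch): templates are formatted with the keyword params, and a decorated function can be reused as a docstring source.

```python
from geopandas._decorator import doc

_template = "Return the {what} of the geometry.\n"

@doc(_template, what="area")
def area(geom):
    ...

# Passing `area` reuses its stored _docstring_components with new params.
@doc(area, what="length")
def length(geom):
    ...

print(area.__doc__)    # Return the area of the geometry.
print(length.__doc__)  # Return the length of the geometry.
```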
diff --git a/geopandas/_version.py b/geopandas/_version.py
index 61aaa9f6..4639f7c3 100644
--- a/geopandas/_version.py
+++ b/geopandas/_version.py
@@ -1,4 +1,15 @@
+# This file helps to compute a version number in source trees obtained from
+# git-archive tarball (such as those provided by githubs download-from-tag
+# feature). Distribution tarballs (built by setup.py sdist) and build
+# directories (produced by setup.py build) will contain a much shorter file
+# that just contains the computed version number.
+
+# This file is released into the public domain.
+# Generated by versioneer-0.29
+# https://github.com/python-versioneer/python-versioneer
+
"""Git implementation of _version.py."""
+
import errno
import os
import re
@@ -8,13 +19,22 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
import functools
-def get_keywords() ->Dict[str, str]:
+def get_keywords() -> Dict[str, str]:
"""Get the keywords needed to look up the version information."""
- pass
+ # these strings will be replaced by git during git-archive.
+ # setup.py/versioneer.py will grep for the variable names, so they must
+ # each be defined on a line of their own. _version.py will just call
+ # get_keywords().
+ git_refnames = "$Format:%d$"
+ git_full = "$Format:%H$"
+ git_date = "$Format:%ci$"
+ keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
+ return keywords
class VersioneerConfig:
"""Container for Versioneer configuration parameters."""
+
VCS: str
style: str
tag_prefix: str
@@ -23,9 +43,18 @@ class VersioneerConfig:
verbose: bool
-def get_config() ->VersioneerConfig:
+def get_config() -> VersioneerConfig:
"""Create, populate and return the VersioneerConfig() object."""
- pass
+ # these strings are filled in when 'setup.py versioneer' creates
+ # _version.py
+ cfg = VersioneerConfig()
+ cfg.VCS = "git"
+ cfg.style = "pep440"
+ cfg.tag_prefix = "v"
+ cfg.parentdir_prefix = "geopandas-"
+ cfg.versionfile_source = "geopandas/_version.py"
+ cfg.verbose = False
+ return cfg
class NotThisMethod(Exception):
@@ -36,60 +65,359 @@ LONG_VERSION_PY: Dict[str, str] = {}
HANDLERS: Dict[str, Dict[str, Callable]] = {}
-def register_vcs_handler(vcs: str, method: str) ->Callable:
+def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator
"""Create decorator to mark a method as the handler of a VCS."""
- pass
+ def decorate(f: Callable) -> Callable:
+ """Store f in HANDLERS[vcs][method]."""
+ if vcs not in HANDLERS:
+ HANDLERS[vcs] = {}
+ HANDLERS[vcs][method] = f
+ return f
-def run_command(commands: List[str], args: List[str], cwd: Optional[str]=
- None, verbose: bool=False, hide_stderr: bool=False, env: Optional[Dict[
- str, str]]=None) ->Tuple[Optional[str], Optional[int]]:
- """Call the given command(s)."""
- pass
+ return decorate
-def versions_from_parentdir(parentdir_prefix: str, root: str, verbose: bool
- ) ->Dict[str, Any]:
+def run_command(
+ commands: List[str],
+ args: List[str],
+ cwd: Optional[str] = None,
+ verbose: bool = False,
+ hide_stderr: bool = False,
+ env: Optional[Dict[str, str]] = None,
+) -> Tuple[Optional[str], Optional[int]]:
+ """Call the given command(s)."""
+ assert isinstance(commands, list)
+ process = None
+
+ popen_kwargs: Dict[str, Any] = {}
+ if sys.platform == "win32":
+ # This hides the console window if pythonw.exe is used
+ startupinfo = subprocess.STARTUPINFO()
+ startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+ popen_kwargs["startupinfo"] = startupinfo
+
+ for command in commands:
+ try:
+ dispcmd = str([command] + args)
+ # remember shell=False, so use git.cmd on windows, not just git
+ process = subprocess.Popen(
+ [command] + args,
+ cwd=cwd,
+ env=env,
+ stdout=subprocess.PIPE,
+ stderr=(subprocess.PIPE if hide_stderr else None),
+ **popen_kwargs,
+ )
+ break
+ except OSError as e:
+ if e.errno == errno.ENOENT:
+ continue
+ if verbose:
+ print("unable to run %s" % dispcmd)
+ print(e)
+ return None, None
+ else:
+ if verbose:
+ print("unable to find command, tried %s" % (commands,))
+ return None, None
+ stdout = process.communicate()[0].strip().decode()
+ if process.returncode != 0:
+ if verbose:
+ print("unable to run %s (error)" % dispcmd)
+ print("stdout was %s" % stdout)
+ return None, process.returncode
+ return stdout, process.returncode
+
+
+def versions_from_parentdir(
+ parentdir_prefix: str,
+ root: str,
+ verbose: bool,
+) -> Dict[str, Any]:
"""Try to determine the version from the parent directory name.
Source tarballs conventionally unpack into a directory that includes both
the project name and a version string. We will also support searching up
two directory levels for an appropriately named parent directory
"""
- pass
-
-
-@register_vcs_handler('git', 'get_keywords')
-def git_get_keywords(versionfile_abs: str) ->Dict[str, str]:
+ rootdirs = []
+
+ for _ in range(3):
+ dirname = os.path.basename(root)
+ if dirname.startswith(parentdir_prefix):
+ return {
+ "version": dirname[len(parentdir_prefix) :],
+ "full-revisionid": None,
+ "dirty": False,
+ "error": None,
+ "date": None,
+ }
+ rootdirs.append(root)
+ root = os.path.dirname(root) # up a level
+
+ if verbose:
+ print(
+ "Tried directories %s but none started with prefix %s"
+ % (str(rootdirs), parentdir_prefix)
+ )
+ raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
+
+
+@register_vcs_handler("git", "get_keywords")
+def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
"""Extract version information from the given file."""
- pass
-
-
-@register_vcs_handler('git', 'keywords')
-def git_versions_from_keywords(keywords: Dict[str, str], tag_prefix: str,
- verbose: bool) ->Dict[str, Any]:
+ # the code embedded in _version.py can just fetch the value of these
+ # keywords. When used from setup.py, we don't want to import _version.py,
+ # so we do it with a regexp instead. This function is not used from
+ # _version.py.
+ keywords: Dict[str, str] = {}
+ try:
+ with open(versionfile_abs, "r") as fobj:
+ for line in fobj:
+ if line.strip().startswith("git_refnames ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["refnames"] = mo.group(1)
+ if line.strip().startswith("git_full ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["full"] = mo.group(1)
+ if line.strip().startswith("git_date ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["date"] = mo.group(1)
+ except OSError:
+ pass
+ return keywords
+
+
+@register_vcs_handler("git", "keywords")
+def git_versions_from_keywords(
+ keywords: Dict[str, str],
+ tag_prefix: str,
+ verbose: bool,
+) -> Dict[str, Any]:
"""Get version information from git keywords."""
- pass
-
-
-@register_vcs_handler('git', 'pieces_from_vcs')
-def git_pieces_from_vcs(tag_prefix: str, root: str, verbose: bool, runner:
- Callable=run_command) ->Dict[str, Any]:
+ if "refnames" not in keywords:
+ raise NotThisMethod("Short version file found")
+ date = keywords.get("date")
+ if date is not None:
+ # Use only the last line. Previous lines may contain GPG signature
+ # information.
+ date = date.splitlines()[-1]
+
+ # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
+ # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
+ # -like" string, which we must then edit to make compliant), because
+ # it's been around since git-1.5.3, and it's too difficult to
+ # discover which version we're using, or to work around using an
+ # older one.
+ date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
+ refnames = keywords["refnames"].strip()
+ if refnames.startswith("$Format"):
+ if verbose:
+ print("keywords are unexpanded, not using")
+ raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
+ refs = {r.strip() for r in refnames.strip("()").split(",")}
+ # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
+ # just "foo-1.0". If we see a "tag: " prefix, prefer those.
+ TAG = "tag: "
+ tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)}
+ if not tags:
+ # Either we're using git < 1.8.3, or there really are no tags. We use
+ # a heuristic: assume all version tags have a digit. The old git %d
+ # expansion behaves like git log --decorate=short and strips out the
+ # refs/heads/ and refs/tags/ prefixes that would let us distinguish
+ # between branches and tags. By ignoring refnames without digits, we
+ # filter out many common branch names like "release" and
+ # "stabilization", as well as "HEAD" and "master".
+ tags = {r for r in refs if re.search(r"\d", r)}
+ if verbose:
+ print("discarding '%s', no digits" % ",".join(refs - tags))
+ if verbose:
+ print("likely tags: %s" % ",".join(sorted(tags)))
+ for ref in sorted(tags):
+ # sorting will prefer e.g. "2.0" over "2.0rc1"
+ if ref.startswith(tag_prefix):
+ r = ref[len(tag_prefix) :]
+ # Filter out refs that exactly match prefix or that don't start
+ # with a number once the prefix is stripped (mostly a concern
+ # when prefix is '')
+ if not re.match(r"\d", r):
+ continue
+ if verbose:
+ print("picking %s" % r)
+ return {
+ "version": r,
+ "full-revisionid": keywords["full"].strip(),
+ "dirty": False,
+ "error": None,
+ "date": date,
+ }
+ # no suitable tags, so version is "0+unknown", but full hex is still there
+ if verbose:
+ print("no suitable tags, using unknown + full revision id")
+ return {
+ "version": "0+unknown",
+ "full-revisionid": keywords["full"].strip(),
+ "dirty": False,
+ "error": "no suitable tags",
+ "date": None,
+ }
+
+
+@register_vcs_handler("git", "pieces_from_vcs")
+def git_pieces_from_vcs(
+ tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command
+) -> Dict[str, Any]:
"""Get version from 'git describe' in the root of the source tree.
This only gets called if the git-archive 'subst' keywords were *not*
expanded, and _version.py hasn't already been rewritten with a short
version string, meaning we're inside a checked out source tree.
"""
- pass
-
-
-def plus_or_dot(pieces: Dict[str, Any]) ->str:
+ GITS = ["git"]
+ if sys.platform == "win32":
+ GITS = ["git.cmd", "git.exe"]
+
+ # GIT_DIR can interfere with correct operation of Versioneer.
+ # It may be intended to be passed to the Versioneer-versioned project,
+ # but that should not change where we get our version from.
+ env = os.environ.copy()
+ env.pop("GIT_DIR", None)
+ runner = functools.partial(runner, env=env)
+
+ _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose)
+ if rc != 0:
+ if verbose:
+ print("Directory %s not under git control" % root)
+ raise NotThisMethod("'git rev-parse --git-dir' returned error")
+
+ # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
+ # if there isn't one, this yields HEX[-dirty] (no NUM)
+ describe_out, rc = runner(
+ GITS,
+ [
+ "describe",
+ "--tags",
+ "--dirty",
+ "--always",
+ "--long",
+ "--match",
+ f"{tag_prefix}[[:digit:]]*",
+ ],
+ cwd=root,
+ )
+ # --long was added in git-1.5.5
+ if describe_out is None:
+ raise NotThisMethod("'git describe' failed")
+ describe_out = describe_out.strip()
+ full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
+ if full_out is None:
+ raise NotThisMethod("'git rev-parse' failed")
+ full_out = full_out.strip()
+
+ pieces: Dict[str, Any] = {}
+ pieces["long"] = full_out
+ pieces["short"] = full_out[:7] # maybe improved later
+ pieces["error"] = None
+
+ branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root)
+ # --abbrev-ref was added in git-1.6.3
+ if rc != 0 or branch_name is None:
+ raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
+ branch_name = branch_name.strip()
+
+ if branch_name == "HEAD":
+ # If we aren't exactly on a branch, pick a branch which represents
+ # the current commit. If all else fails, we are on a branchless
+ # commit.
+ branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
+ # --contains was added in git-1.5.4
+ if rc != 0 or branches is None:
+ raise NotThisMethod("'git branch --contains' returned error")
+ branches = branches.split("\n")
+
+ # Remove the first line if we're running detached
+ if "(" in branches[0]:
+ branches.pop(0)
+
+ # Strip off the leading "* " from the list of branches.
+ branches = [branch[2:] for branch in branches]
+ if "master" in branches:
+ branch_name = "master"
+ elif not branches:
+ branch_name = None
+ else:
+ # Pick the first branch that is returned. Good or bad.
+ branch_name = branches[0]
+
+ pieces["branch"] = branch_name
+
+ # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
+ # TAG might have hyphens.
+ git_describe = describe_out
+
+ # look for -dirty suffix
+ dirty = git_describe.endswith("-dirty")
+ pieces["dirty"] = dirty
+ if dirty:
+ git_describe = git_describe[: git_describe.rindex("-dirty")]
+
+ # now we have TAG-NUM-gHEX or HEX
+
+ if "-" in git_describe:
+ # TAG-NUM-gHEX
+ mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
+ if not mo:
+ # unparsable. Maybe git-describe is misbehaving?
+ pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
+ return pieces
+
+ # tag
+ full_tag = mo.group(1)
+ if not full_tag.startswith(tag_prefix):
+ if verbose:
+ fmt = "tag '%s' doesn't start with prefix '%s'"
+ print(fmt % (full_tag, tag_prefix))
+ pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
+ full_tag,
+ tag_prefix,
+ )
+ return pieces
+ pieces["closest-tag"] = full_tag[len(tag_prefix) :]
+
+ # distance: number of commits since tag
+ pieces["distance"] = int(mo.group(2))
+
+ # commit: short hex revision ID
+ pieces["short"] = mo.group(3)
+
+ else:
+ # HEX: no tags
+ pieces["closest-tag"] = None
+ out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
+ pieces["distance"] = len(out.split()) # total number of commits
+
+ # commit date: see ISO-8601 comment in git_versions_from_keywords()
+ date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
+ # Use only the last line. Previous lines may contain GPG signature
+ # information.
+ date = date.splitlines()[-1]
+ pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
+
+ return pieces
+
+
+def plus_or_dot(pieces: Dict[str, Any]) -> str:
"""Return a + if we don't already have one, else return a ."""
- pass
+ if "+" in pieces.get("closest-tag", ""):
+ return "."
+ return "+"
-def render_pep440(pieces: Dict[str, Any]) ->str:
+def render_pep440(pieces: Dict[str, Any]) -> str:
"""Build up version string, with post-release "local version identifier".
Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
@@ -98,10 +426,22 @@ def render_pep440(pieces: Dict[str, Any]) ->str:
Exceptions:
1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
"""
- pass
-
-
-def render_pep440_branch(pieces: Dict[str, Any]) ->str:
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += plus_or_dot(pieces)
+ rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ else:
+ # exception #1
+ rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ return rendered
+
+
+def render_pep440_branch(pieces: Dict[str, Any]) -> str:
"""TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
The ".dev0" means not master branch. Note that .dev0 sorts backwards
@@ -110,28 +450,61 @@ def render_pep440_branch(pieces: Dict[str, Any]) ->str:
Exceptions:
1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
"""
- pass
-
-
-def pep440_split_post(ver: str) ->Tuple[str, Optional[int]]:
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ if pieces["branch"] != "master":
+ rendered += ".dev0"
+ rendered += plus_or_dot(pieces)
+ rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ else:
+ # exception #1
+ rendered = "0"
+ if pieces["branch"] != "master":
+ rendered += ".dev0"
+ rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ return rendered
+
+
+def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
"""Split pep440 version string at the post-release segment.
Returns the release segments before the post-release and the
post-release version number (or -1 if no post-release segment is present).
"""
- pass
+ vc = str.split(ver, ".post")
+ return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
-def render_pep440_pre(pieces: Dict[str, Any]) ->str:
+def render_pep440_pre(pieces: Dict[str, Any]) -> str:
"""TAG[.postN.devDISTANCE] -- No -dirty.
Exceptions:
1: no tags. 0.post0.devDISTANCE
"""
- pass
-
-
-def render_pep440_post(pieces: Dict[str, Any]) ->str:
+ if pieces["closest-tag"]:
+ if pieces["distance"]:
+ # update the post release segment
+ tag_version, post_version = pep440_split_post(pieces["closest-tag"])
+ rendered = tag_version
+ if post_version is not None:
+ rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"])
+ else:
+ rendered += ".post0.dev%d" % (pieces["distance"])
+ else:
+ # no commits, use the tag as the version
+ rendered = pieces["closest-tag"]
+ else:
+ # exception #1
+ rendered = "0.post0.dev%d" % pieces["distance"]
+ return rendered
+
+
+def render_pep440_post(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX] .
The ".dev0" means dirty. Note that .dev0 sorts backwards
@@ -141,10 +514,24 @@ def render_pep440_post(pieces: Dict[str, Any]) ->str:
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]
"""
- pass
-
-
-def render_pep440_post_branch(pieces: Dict[str, Any]) ->str:
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += ".post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ rendered += plus_or_dot(pieces)
+ rendered += "g%s" % pieces["short"]
+ else:
+ # exception #1
+ rendered = "0.post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ rendered += "+g%s" % pieces["short"]
+ return rendered
+
+
+def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
The ".dev0" means not master branch.
@@ -152,10 +539,28 @@ def render_pep440_post_branch(pieces: Dict[str, Any]) ->str:
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
"""
- pass
-
-
-def render_pep440_old(pieces: Dict[str, Any]) ->str:
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += ".post%d" % pieces["distance"]
+ if pieces["branch"] != "master":
+ rendered += ".dev0"
+ rendered += plus_or_dot(pieces)
+ rendered += "g%s" % pieces["short"]
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ else:
+ # exception #1
+ rendered = "0.post%d" % pieces["distance"]
+ if pieces["branch"] != "master":
+ rendered += ".dev0"
+ rendered += "+g%s" % pieces["short"]
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ return rendered
+
+
+def render_pep440_old(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]] .
The ".dev0" means dirty.
@@ -163,10 +568,21 @@ def render_pep440_old(pieces: Dict[str, Any]) ->str:
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]
"""
- pass
-
-
-def render_git_describe(pieces: Dict[str, Any]) ->str:
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += ".post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ else:
+ # exception #1
+ rendered = "0.post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ return rendered
+
+
+def render_git_describe(pieces: Dict[str, Any]) -> str:
"""TAG[-DISTANCE-gHEX][-dirty].
Like 'git describe --tags --dirty --always'.
@@ -174,10 +590,19 @@ def render_git_describe(pieces: Dict[str, Any]) ->str:
Exceptions:
1: no tags. HEX[-dirty] (note: no 'g' prefix)
"""
- pass
-
-
-def render_git_describe_long(pieces: Dict[str, Any]) ->str:
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"]:
+ rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+ else:
+ # exception #1
+ rendered = pieces["short"]
+ if pieces["dirty"]:
+ rendered += "-dirty"
+ return rendered
+
+
+def render_git_describe_long(pieces: Dict[str, Any]) -> str:
"""TAG-DISTANCE-gHEX[-dirty].
Like 'git describe --tags --dirty --always -long'.
@@ -186,14 +611,106 @@ def render_git_describe_long(pieces: Dict[str, Any]) ->str:
Exceptions:
1: no tags. HEX[-dirty] (note: no 'g' prefix)
"""
- pass
-
-
-def render(pieces: Dict[str, Any], style: str) ->Dict[str, Any]:
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+ else:
+ # exception #1
+ rendered = pieces["short"]
+ if pieces["dirty"]:
+ rendered += "-dirty"
+ return rendered
+
+
+def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
"""Render the given version pieces into the requested style."""
- pass
-
-
-def get_versions() ->Dict[str, Any]:
+ if pieces["error"]:
+ return {
+ "version": "unknown",
+ "full-revisionid": pieces.get("long"),
+ "dirty": None,
+ "error": pieces["error"],
+ "date": None,
+ }
+
+ if not style or style == "default":
+ style = "pep440" # the default
+
+ if style == "pep440":
+ rendered = render_pep440(pieces)
+ elif style == "pep440-branch":
+ rendered = render_pep440_branch(pieces)
+ elif style == "pep440-pre":
+ rendered = render_pep440_pre(pieces)
+ elif style == "pep440-post":
+ rendered = render_pep440_post(pieces)
+ elif style == "pep440-post-branch":
+ rendered = render_pep440_post_branch(pieces)
+ elif style == "pep440-old":
+ rendered = render_pep440_old(pieces)
+ elif style == "git-describe":
+ rendered = render_git_describe(pieces)
+ elif style == "git-describe-long":
+ rendered = render_git_describe_long(pieces)
+ else:
+ raise ValueError("unknown style '%s'" % style)
+
+ return {
+ "version": rendered,
+ "full-revisionid": pieces["long"],
+ "dirty": pieces["dirty"],
+ "error": None,
+ "date": pieces.get("date"),
+ }
+
+
+def get_versions() -> Dict[str, Any]:
"""Get version information or return default if unable to do so."""
- pass
+ # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
+ # __file__, we can work backwards from there to the root. Some
+ # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
+ # case we can only use expanded keywords.
+
+ cfg = get_config()
+ verbose = cfg.verbose
+
+ try:
+ return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose)
+ except NotThisMethod:
+ pass
+
+ try:
+ root = os.path.realpath(__file__)
+ # versionfile_source is the relative path from the top of the source
+ # tree (where the .git directory might live) to this file. Invert
+ # this to find the root from __file__.
+ for _ in cfg.versionfile_source.split("/"):
+ root = os.path.dirname(root)
+ except NameError:
+ return {
+ "version": "0+unknown",
+ "full-revisionid": None,
+ "dirty": None,
+ "error": "unable to find root of source tree",
+ "date": None,
+ }
+
+ try:
+ pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
+ return render(pieces, cfg.style)
+ except NotThisMethod:
+ pass
+
+ try:
+ if cfg.parentdir_prefix:
+ return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
+ except NotThisMethod:
+ pass
+
+ return {
+ "version": "0+unknown",
+ "full-revisionid": None,
+ "dirty": None,
+ "error": "unable to compute version",
+ "date": None,
+ }
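
To make the rendering logic above concrete, a sketch of `render()` on a hand-built `pieces` dict (the values are fabricated for illustration):

```python
from geopandas._version import render

pieces = {
    "long": "0123456789abcdef0123456789abcdef01234567",
    "short": "0123456",
    "closest-tag": "0.14.0",
    "distance": 5,
    "dirty": True,
    "branch": "main",
    "error": None,
    "date": "2024-01-01T00:00:00+0000",
}

print(render(pieces, "pep440")["version"])       # 0.14.0+5.g0123456.dirty
print(render(pieces, "pep440-post")["version"])  # 0.14.0.post5.dev0+g0123456
```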
diff --git a/geopandas/array.py b/geopandas/array.py
index 3f1cc54a..32f338a7 100644
--- a/geopandas/array.py
+++ b/geopandas/array.py
@@ -3,24 +3,44 @@ import numbers
import operator
import warnings
from functools import lru_cache
+
import numpy as np
import pandas as pd
-from pandas.api.extensions import ExtensionArray, ExtensionDtype, register_extension_dtype
+from pandas.api.extensions import (
+ ExtensionArray,
+ ExtensionDtype,
+ register_extension_dtype,
+)
+
import shapely
import shapely.affinity
import shapely.geometry
import shapely.ops
import shapely.wkt
from shapely.geometry.base import BaseGeometry
+
from ._compat import HAS_PYPROJ, requires_pyproj
from .sindex import SpatialIndex
+
if HAS_PYPROJ:
from pyproj import Transformer
+
TransformerFromCRS = lru_cache(Transformer.from_crs)
-_names = {'MISSING': None, 'NAG': None, 'POINT': 'Point', 'LINESTRING':
- 'LineString', 'LINEARRING': 'LinearRing', 'POLYGON': 'Polygon',
- 'MULTIPOINT': 'MultiPoint', 'MULTILINESTRING': 'MultiLineString',
- 'MULTIPOLYGON': 'MultiPolygon', 'GEOMETRYCOLLECTION': 'GeometryCollection'}
+
+_names = {
+ "MISSING": None,
+ "NAG": None,
+ "POINT": "Point",
+ "LINESTRING": "LineString",
+ "LINEARRING": "LinearRing",
+ "POLYGON": "Polygon",
+ "MULTIPOINT": "MultiPoint",
+ "MULTILINESTRING": "MultiLineString",
+ "MULTIPOLYGON": "MultiPolygon",
+ "GEOMETRYCOLLECTION": "GeometryCollection",
+}
+
+
type_mapping = {p.value: _names[p.name] for p in shapely.GeometryType}
geometry_type_ids = list(type_mapping.keys())
geometry_type_values = np.array(list(type_mapping.values()), dtype=object)
@@ -28,9 +48,26 @@ geometry_type_values = np.array(list(type_mapping.values()), dtype=object)
class GeometryDtype(ExtensionDtype):
type = BaseGeometry
- name = 'geometry'
+ name = "geometry"
na_value = np.nan
+ @classmethod
+ def construct_from_string(cls, string):
+ if not isinstance(string, str):
+ raise TypeError(
+ "'construct_from_string' expects a string, got {}".format(type(string))
+ )
+ elif string == cls.name:
+ return cls()
+ else:
+ raise TypeError(
+ "Cannot construct a '{}' from '{}'".format(cls.__name__, string)
+ )
+
+ @classmethod
+ def construct_array_type(cls):
+ return GeometryArray
+
register_extension_dtype(GeometryDtype)
@@ -41,14 +78,42 @@ def _check_crs(left, right, allow_none=False):
If allow_none is True, empty CRS is treated as the same.
"""
- pass
+ if allow_none:
+ if not left.crs or not right.crs:
+ return True
+ if not left.crs == right.crs:
+ return False
+ return True
def _crs_mismatch_warn(left, right, stacklevel=3):
"""
Raise a CRS mismatch warning with the information on the assigned CRS.
"""
- pass
+ if left.crs:
+ left_srs = left.crs.to_string()
+ left_srs = left_srs if len(left_srs) <= 50 else " ".join([left_srs[:50], "..."])
+ else:
+ left_srs = None
+
+ if right.crs:
+ right_srs = right.crs.to_string()
+ right_srs = (
+ right_srs if len(right_srs) <= 50 else " ".join([right_srs[:50], "..."])
+ )
+ else:
+ right_srs = None
+
+ warnings.warn(
+ "CRS mismatch between the CRS of left geometries "
+ "and the CRS of right geometries.\n"
+ "Use `to_crs()` to reproject one of "
+ "the input geometries to match the CRS of the other.\n\n"
+ "Left CRS: {0}\n"
+ "Right CRS: {1}\n".format(left_srs, right_srs),
+ UserWarning,
+ stacklevel=stacklevel,
+ )
def isna(value):
@@ -58,7 +123,23 @@ def isna(value):
Custom version that only works for scalars (returning True or False),
as `pd.isna` also works for array-like input returning a boolean array.
"""
- pass
+ if value is None:
+ return True
+ elif isinstance(value, float) and np.isnan(value):
+ return True
+ elif value is pd.NA:
+ return True
+ else:
+ return False
+
+
+# -----------------------------------------------------------------------------
+# Constructors / converters to other formats
+# -----------------------------------------------------------------------------
+
+
+def _is_scalar_geometry(geom):
+ return isinstance(geom, BaseGeometry)
def from_shapely(data, crs=None):
@@ -77,17 +158,42 @@ def from_shapely(data, crs=None):
such as an authority string (eg "EPSG:4326") or a WKT string.
"""
- pass
+ if not isinstance(data, np.ndarray):
+ arr = np.empty(len(data), dtype=object)
+ arr[:] = data
+ else:
+ arr = data
+
+ if not shapely.is_valid_input(arr).all():
+ out = []
+
+ for geom in data:
+ if isinstance(geom, BaseGeometry):
+ out.append(geom)
+ elif hasattr(geom, "__geo_interface__"):
+ geom = shapely.geometry.shape(geom)
+ out.append(geom)
+ elif isna(geom):
+ out.append(None)
+ else:
+ raise TypeError(
+ "Input must be valid geometry objects: {0}".format(geom)
+ )
+ arr = np.array(out, dtype=object)
+
+ return GeometryArray(arr, crs=crs)
def to_shapely(geoms):
"""
Convert GeometryArray to numpy object array of shapely objects.
"""
- pass
+ if not isinstance(geoms, GeometryArray):
+ raise ValueError("'geoms' must be a GeometryArray")
+ return geoms._data
-def from_wkb(data, crs=None, on_invalid='raise'):
+def from_wkb(data, crs=None, on_invalid="raise"):
"""
Convert a list or array of WKB objects to a GeometryArray.
@@ -106,17 +212,19 @@ def from_wkb(data, crs=None, on_invalid='raise'):
- ignore: invalid WKB geometries will be returned as None without a warning.
"""
- pass
+ return GeometryArray(shapely.from_wkb(data, on_invalid=on_invalid), crs=crs)
def to_wkb(geoms, hex=False, **kwargs):
"""
Convert GeometryArray to a numpy object array of WKB objects.
"""
- pass
+ if not isinstance(geoms, GeometryArray):
+ raise ValueError("'geoms' must be a GeometryArray")
+ return shapely.to_wkb(geoms, hex=hex, **kwargs)
-def from_wkt(data, crs=None, on_invalid='raise'):
+def from_wkt(data, crs=None, on_invalid="raise"):
"""
Convert a list or array of WKT objects to a GeometryArray.
@@ -135,14 +243,16 @@ def from_wkt(data, crs=None, on_invalid='raise'):
- ignore: invalid WKT geometries will be returned as ``None`` without a warning.
"""
- pass
+ return GeometryArray(shapely.from_wkt(data, on_invalid=on_invalid), crs=crs)
def to_wkt(geoms, **kwargs):
"""
Convert GeometryArray to a numpy object array of WKT objects.
"""
- pass
+ if not isinstance(geoms, GeometryArray):
+ raise ValueError("'geoms' must be a GeometryArray")
+ return shapely.to_wkt(geoms, **kwargs)
def points_from_xy(x, y, z=None, crs=None):
@@ -188,7 +298,12 @@ def points_from_xy(x, y, z=None, crs=None):
-------
output : GeometryArray
"""
- pass
+ x = np.asarray(x, dtype="float64")
+ y = np.asarray(y, dtype="float64")
+ if z is not None:
+ z = np.asarray(z, dtype="float64")
+
+ return GeometryArray(shapely.points(x, y, z), crs=crs)
class GeometryArray(ExtensionArray):
@@ -196,6 +311,7 @@ class GeometryArray(ExtensionArray):
Class wrapping a numpy array of Shapely objects and
holding the array-based implementations.
"""
+
_dtype = GeometryDtype()
def __init__(self, data, crs=None):
@@ -205,16 +321,25 @@ class GeometryArray(ExtensionArray):
data = data._data
elif not isinstance(data, np.ndarray):
raise TypeError(
- "'data' should be array of geometry objects. Use from_shapely, from_wkb, from_wkt functions to construct a GeometryArray."
- )
+ "'data' should be array of geometry objects. Use from_shapely, "
+ "from_wkb, from_wkt functions to construct a GeometryArray."
+ )
elif not data.ndim == 1:
raise ValueError(
- "'data' should be a 1-dimensional array of geometry objects.")
+ "'data' should be a 1-dimensional array of geometry objects."
+ )
self._data = data
+
self._crs = None
self.crs = crs
self._sindex = None
+ @property
+ def sindex(self):
+ if self._sindex is None:
+ self._sindex = SpatialIndex(self._data)
+ return self._sindex
+
@property
def has_sindex(self):
"""Check the existence of the spatial index without generating it.
@@ -237,7 +362,7 @@ class GeometryArray(ExtensionArray):
`True` if the spatial index has been generated or
`False` if not.
"""
- pass
+ return self._sindex is not None
@property
def crs(self):
@@ -251,16 +376,43 @@ class GeometryArray(ExtensionArray):
:meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
"""
- pass
+ return self._crs
@crs.setter
def crs(self, value):
"""Sets the value of the crs"""
- pass
+ if HAS_PYPROJ:
+ from pyproj import CRS
+
+ self._crs = None if not value else CRS.from_user_input(value)
+ else:
+ if value is not None:
+ warnings.warn(
+ "Cannot set the CRS, falling back to None. The CRS support requires"
+ " the 'pyproj' package, but it is not installed or does not import"
+ " correctly. The functions depending on CRS will raise an error or"
+ " may produce unexpected results.",
+ UserWarning,
+ stacklevel=2,
+ )
+ self._crs = None
def check_geographic_crs(self, stacklevel):
"""Check CRS and warn if the planar operation is done in a geographic CRS"""
- pass
+ if self.crs and self.crs.is_geographic:
+ warnings.warn(
+ "Geometry is in a geographic CRS. Results from '{}' are likely "
+ "incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a "
+ "projected CRS before this operation.\n".format(
+ inspect.stack()[1].function
+ ),
+ UserWarning,
+ stacklevel=stacklevel,
+ )
+
+ @property
+ def dtype(self):
+ return self._dtype
def __len__(self):
return self.shape[0]
@@ -268,10 +420,15 @@ class GeometryArray(ExtensionArray):
def __getitem__(self, idx):
if isinstance(idx, numbers.Integral):
return self._data[idx]
+ # array-like, slice
+ # validate and convert IntegerArray/BooleanArray
+ # to numpy array, pass-through non-array-like indexers
idx = pd.api.indexers.check_array_indexer(self, idx)
return GeometryArray(self._data[idx], crs=self.crs)
def __setitem__(self, key, value):
+ # validate and convert IntegerArray/BooleanArray
+ # keys to numpy array, pass-through non-array-like indexers
key = pd.api.indexers.check_array_indexer(self, key)
if isinstance(value, pd.Series):
value = value.values
@@ -281,15 +438,17 @@ class GeometryArray(ExtensionArray):
value = from_shapely(value)
if isinstance(value, GeometryArray):
if isinstance(key, numbers.Integral):
- raise ValueError('cannot set a single element with an array')
+ raise ValueError("cannot set a single element with an array")
self._data[key] = value._data
elif isinstance(value, BaseGeometry) or isna(value):
if isna(value):
+ # internally only use None as missing value indicator
+ # but accept others
value = None
elif isinstance(value, BaseGeometry):
value = from_shapely([value])._data[0]
else:
- raise TypeError('should be valid geometry')
+ raise TypeError("should be valid geometry")
if isinstance(key, (slice, list, np.ndarray)):
value_array = np.empty(1, dtype=object)
value_array[:] = [value]
@@ -298,27 +457,475 @@ class GeometryArray(ExtensionArray):
self._data[key] = value
else:
raise TypeError(
- 'Value should be either a BaseGeometry or None, got %s' %
- str(value))
+ "Value should be either a BaseGeometry or None, got %s" % str(value)
+ )
+
+ # invalidate spatial index
self._sindex = None
+ # TODO: use this once pandas-dev/pandas#33457 is fixed
+ # if hasattr(value, "crs"):
+ # if value.crs and (value.crs != self.crs):
+ # raise ValueError(
+ # "CRS mismatch between CRS of the passed geometries "
+ # "and CRS of existing geometries."
+ # )
+
def __getstate__(self):
- return shapely.to_wkb(self._data), self._crs
+ return (shapely.to_wkb(self._data), self._crs)
def __setstate__(self, state):
if not isinstance(state, dict):
+ # pickle file saved with pygeos
geoms = shapely.from_wkb(state[0])
self._crs = state[1]
- self._sindex = None
+ self._sindex = None # pygeos.STRtree could not be pickled yet
self._data = geoms
self.base = None
else:
- if 'data' in state:
- state['_data'] = state.pop('data')
- if '_crs' not in state:
- state['_crs'] = None
+ if "data" in state:
+ state["_data"] = state.pop("data")
+ if "_crs" not in state:
+ state["_crs"] = None
self.__dict__.update(state)
+ # -------------------------------------------------------------------------
+ # Geometry related methods
+ # -------------------------------------------------------------------------
+
+ @property
+ def is_valid(self):
+ return shapely.is_valid(self._data)
+
+ def is_valid_reason(self):
+ return shapely.is_valid_reason(self._data)
+
+ @property
+ def is_empty(self):
+ return shapely.is_empty(self._data)
+
+ @property
+ def is_simple(self):
+ return shapely.is_simple(self._data)
+
+ @property
+ def is_ring(self):
+ return shapely.is_ring(self._data)
+
+ @property
+ def is_closed(self):
+ return shapely.is_closed(self._data)
+
+ @property
+ def is_ccw(self):
+ return shapely.is_ccw(self._data)
+
+ @property
+ def has_z(self):
+ return shapely.has_z(self._data)
+
+ @property
+ def geom_type(self):
+ res = shapely.get_type_id(self._data)
+ return geometry_type_values[np.searchsorted(geometry_type_ids, res)]
+
+ @property
+ def area(self):
+ self.check_geographic_crs(stacklevel=5)
+ return shapely.area(self._data)
+
+ @property
+ def length(self):
+ self.check_geographic_crs(stacklevel=5)
+ return shapely.length(self._data)
+
+ def count_coordinates(self):
+ return shapely.get_num_coordinates(self._data)
+
+ def count_geometries(self):
+ return shapely.get_num_geometries(self._data)
+
+ def count_interior_rings(self):
+ return shapely.get_num_interior_rings(self._data)
+
+ def get_precision(self):
+ return shapely.get_precision(self._data)
+
+ def get_geometry(self, index):
+ return shapely.get_geometry(self._data, index=index)
+
+ #
+ # Unary operations that return new geometries
+ #
+
+ @property
+ def boundary(self):
+ return GeometryArray(shapely.boundary(self._data), crs=self.crs)
+
+ @property
+ def centroid(self):
+ self.check_geographic_crs(stacklevel=5)
+ return GeometryArray(shapely.centroid(self._data), crs=self.crs)
+
+ def concave_hull(self, ratio, allow_holes):
+ return shapely.concave_hull(self._data, ratio=ratio, allow_holes=allow_holes)
+
+ @property
+ def convex_hull(self):
+ return GeometryArray(shapely.convex_hull(self._data), crs=self.crs)
+
+ @property
+ def envelope(self):
+ return GeometryArray(shapely.envelope(self._data), crs=self.crs)
+
+ def minimum_rotated_rectangle(self):
+ return GeometryArray(shapely.oriented_envelope(self._data), crs=self.crs)
+
+ @property
+ def exterior(self):
+ return GeometryArray(shapely.get_exterior_ring(self._data), crs=self.crs)
+
+ def extract_unique_points(self):
+ return GeometryArray(shapely.extract_unique_points(self._data), crs=self.crs)
+
+ def offset_curve(self, distance, quad_segs=8, join_style="round", mitre_limit=5.0):
+ return GeometryArray(
+ shapely.offset_curve(
+ self._data,
+ distance,
+ quad_segs=quad_segs,
+ join_style=join_style,
+ mitre_limit=mitre_limit,
+ ),
+ crs=self.crs,
+ )
+
+ @property
+ def interiors(self):
+ # no GeometryArray as result
+ has_non_poly = False
+ inner_rings = []
+ for geom in self._data:
+ interior_ring_seq = getattr(geom, "interiors", None)
+ # polygon case
+ if interior_ring_seq is not None:
+ inner_rings.append(list(interior_ring_seq))
+ # non-polygon case
+ else:
+ has_non_poly = True
+ inner_rings.append(None)
+ if has_non_poly:
+ warnings.warn(
+ "Only Polygon objects have interior rings. For other "
+ "geometry types, None is returned.",
+ stacklevel=2,
+ )
+ # need to allocate empty first in case of all empty lists in inner_rings
+ data = np.empty(len(inner_rings), dtype=object)
+ data[:] = inner_rings
+ return data
+
+ def remove_repeated_points(self, tolerance=0.0):
+ return GeometryArray(
+ shapely.remove_repeated_points(self._data, tolerance=tolerance),
+ crs=self.crs,
+ )
+
+ def representative_point(self):
+ return GeometryArray(shapely.point_on_surface(self._data), crs=self.crs)
+
+ def minimum_bounding_circle(self):
+ return GeometryArray(shapely.minimum_bounding_circle(self._data), crs=self.crs)
+
+ def minimum_bounding_radius(self):
+ return shapely.minimum_bounding_radius(self._data)
+
+ def minimum_clearance(self):
+ return shapely.minimum_clearance(self._data)
+
+ def normalize(self):
+ return GeometryArray(shapely.normalize(self._data), crs=self.crs)
+
+ def make_valid(self):
+ return GeometryArray(shapely.make_valid(self._data), crs=self.crs)
+
+ def reverse(self):
+ return GeometryArray(shapely.reverse(self._data), crs=self.crs)
+
+ def segmentize(self, max_segment_length):
+ return GeometryArray(
+ shapely.segmentize(self._data, max_segment_length),
+ crs=self.crs,
+ )
+
+ def force_2d(self):
+ return GeometryArray(shapely.force_2d(self._data), crs=self.crs)
+
+ def force_3d(self, z=0):
+ return GeometryArray(shapely.force_3d(self._data, z=z), crs=self.crs)
+
+ def transform(self, transformation, include_z=False):
+ return GeometryArray(
+ shapely.transform(self._data, transformation, include_z), crs=self.crs
+ )
+
+ def line_merge(self, directed=False):
+ return GeometryArray(
+ shapely.line_merge(self._data, directed=directed), crs=self.crs
+ )
+
+ def set_precision(self, grid_size, mode="valid_output"):
+ return GeometryArray(
+ shapely.set_precision(self._data, grid_size=grid_size, mode=mode),
+ crs=self.crs,
+ )
+
+ #
+ # Binary predicates
+ #
+
+ @staticmethod
+ def _binary_method(op, left, right, **kwargs):
+ if isinstance(right, GeometryArray):
+ if len(left) != len(right):
+ msg = "Lengths of inputs do not match. Left: {0}, Right: {1}".format(
+ len(left), len(right)
+ )
+ raise ValueError(msg)
+ if not _check_crs(left, right):
+ _crs_mismatch_warn(left, right, stacklevel=7)
+ right = right._data
+
+ return getattr(shapely, op)(left._data, right, **kwargs)
+
+ def covers(self, other):
+ return self._binary_method("covers", self, other)
+
+ def covered_by(self, other):
+ return self._binary_method("covered_by", self, other)
+
+ def contains(self, other):
+ return self._binary_method("contains", self, other)
+
+ def contains_properly(self, other):
+ return self._binary_method("contains_properly", self, other)
+
+ def crosses(self, other):
+ return self._binary_method("crosses", self, other)
+
+ def disjoint(self, other):
+ return self._binary_method("disjoint", self, other)
+
+ def geom_equals(self, other):
+ return self._binary_method("equals", self, other)
+
+ def intersects(self, other):
+ return self._binary_method("intersects", self, other)
+
+ def overlaps(self, other):
+ return self._binary_method("overlaps", self, other)
+
+ def touches(self, other):
+ return self._binary_method("touches", self, other)
+
+ def within(self, other):
+ return self._binary_method("within", self, other)
+
+ def dwithin(self, other, distance):
+ self.check_geographic_crs(stacklevel=6)
+ return self._binary_method("dwithin", self, other, distance=distance)
+
+ def geom_equals_exact(self, other, tolerance):
+ return self._binary_method("equals_exact", self, other, tolerance=tolerance)
+
+ def geom_almost_equals(self, other, decimal):
+ warnings.warn(
+ "The 'geom_almost_equals()' method is deprecated because the name is "
+ "confusing. The 'geom_equals_exact()' method should be used instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return self.geom_equals_exact(other, 0.5 * 10 ** (-decimal))
+
+ #
+ # Binary operations that return new geometries
+ #
+
+ def clip_by_rect(self, xmin, ymin, xmax, ymax):
+ return GeometryArray(
+ shapely.clip_by_rect(self._data, xmin, ymin, xmax, ymax), crs=self.crs
+ )
+
+ def difference(self, other):
+ return GeometryArray(
+ self._binary_method("difference", self, other), crs=self.crs
+ )
+
+ def intersection(self, other):
+ return GeometryArray(
+ self._binary_method("intersection", self, other), crs=self.crs
+ )
+
+ def symmetric_difference(self, other):
+ return GeometryArray(
+ self._binary_method("symmetric_difference", self, other), crs=self.crs
+ )
+
+ def union(self, other):
+ return GeometryArray(self._binary_method("union", self, other), crs=self.crs)
+
+ def shortest_line(self, other):
+ return GeometryArray(
+ self._binary_method("shortest_line", self, other), crs=self.crs
+ )
+
+ def snap(self, other, tolerance):
+ return GeometryArray(
+ self._binary_method("snap", self, other, tolerance=tolerance), crs=self.crs
+ )
+
+ def shared_paths(self, other):
+ return GeometryArray(
+ self._binary_method("shared_paths", self, other), crs=self.crs
+ )
+
+ #
+ # Other operations
+ #
+
+ def distance(self, other):
+ self.check_geographic_crs(stacklevel=6)
+ return self._binary_method("distance", self, other)
+
+ def hausdorff_distance(self, other, **kwargs):
+ self.check_geographic_crs(stacklevel=6)
+ return self._binary_method("hausdorff_distance", self, other, **kwargs)
+
+ def frechet_distance(self, other, **kwargs):
+ self.check_geographic_crs(stacklevel=6)
+ return self._binary_method("frechet_distance", self, other, **kwargs)
+
+ def buffer(self, distance, resolution=16, **kwargs):
+ if not (isinstance(distance, (int, float)) and distance == 0):
+ self.check_geographic_crs(stacklevel=5)
+ return GeometryArray(
+ shapely.buffer(self._data, distance, quad_segs=resolution, **kwargs),
+ crs=self.crs,
+ )
+
+ def interpolate(self, distance, normalized=False):
+ self.check_geographic_crs(stacklevel=5)
+ return GeometryArray(
+ shapely.line_interpolate_point(self._data, distance, normalized=normalized),
+ crs=self.crs,
+ )
+
+ def simplify(self, tolerance, preserve_topology=True):
+ return GeometryArray(
+ shapely.simplify(
+ self._data, tolerance, preserve_topology=preserve_topology
+ ),
+ crs=self.crs,
+ )
+
+ def project(self, other, normalized=False):
+ if isinstance(other, GeometryArray):
+ other = other._data
+ return shapely.line_locate_point(self._data, other, normalized=normalized)
+
+ def relate(self, other):
+ if isinstance(other, GeometryArray):
+ other = other._data
+ return shapely.relate(self._data, other)
+
+ def relate_pattern(self, other, pattern):
+ if isinstance(other, GeometryArray):
+ other = other._data
+ return shapely.relate_pattern(self._data, other, pattern)
+
+ #
+ # Reduction operations that return a Shapely geometry
+ #
+
+ def unary_union(self):
+ warnings.warn(
+ "The 'unary_union' attribute is deprecated, "
+ "use the 'union_all' method instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return self.union_all()
+
+ def union_all(self, method="unary"):
+ if method == "coverage":
+ return shapely.coverage_union_all(self._data)
+ elif method == "unary":
+ return shapely.union_all(self._data)
+ else:
+ raise ValueError(
+ f"Method '{method}' not recognized. Use 'coverage' or 'unary'."
+ )
+
+ def intersection_all(self):
+ return shapely.intersection_all(self._data)
+
+ #
+ # Affinity operations
+ #
+
+ @staticmethod
+ def _affinity_method(op, left, *args, **kwargs):
+ # not all shapely.affinity methods can handle empty geometries:
+ # affine_transform itself works (as well as translate), but rotate, scale
+ # and skew fail (they try to unpack the bounds).
+ # Here: consistently returning empty geom for input empty geom
+ out = []
+ for geom in left:
+ if geom is None or geom.is_empty:
+ res = geom
+ else:
+ res = getattr(shapely.affinity, op)(geom, *args, **kwargs)
+ out.append(res)
+ data = np.empty(len(left), dtype=object)
+ data[:] = out
+ return data
+
+ def affine_transform(self, matrix):
+ return GeometryArray(
+ self._affinity_method("affine_transform", self._data, matrix),
+ crs=self.crs,
+ )
+
+ def translate(self, xoff=0.0, yoff=0.0, zoff=0.0):
+ return GeometryArray(
+ self._affinity_method("translate", self._data, xoff, yoff, zoff),
+ crs=self.crs,
+ )
+
+ def rotate(self, angle, origin="center", use_radians=False):
+ return GeometryArray(
+ self._affinity_method(
+ "rotate", self._data, angle, origin=origin, use_radians=use_radians
+ ),
+ crs=self.crs,
+ )
+
+ def scale(self, xfact=1.0, yfact=1.0, zfact=1.0, origin="center"):
+ return GeometryArray(
+ self._affinity_method(
+ "scale", self._data, xfact, yfact, zfact, origin=origin
+ ),
+ crs=self.crs,
+ )
+
+ def skew(self, xs=0.0, ys=0.0, origin="center", use_radians=False):
+ return GeometryArray(
+ self._affinity_method(
+ "skew", self._data, xs, ys, origin=origin, use_radians=use_radians
+ ),
+ crs=self.crs,
+ )
+
@requires_pyproj
def to_crs(self, crs=None, epsg=None):
"""Returns a ``GeometryArray`` with all geometries transformed to a new
@@ -389,10 +996,31 @@ class GeometryArray(ExtensionArray):
- Prime Meridian: Greenwich
"""
- pass
+ from pyproj import CRS
+
+ if self.crs is None:
+ raise ValueError(
+ "Cannot transform naive geometries. "
+ "Please set a crs on the object first."
+ )
+ if crs is not None:
+ crs = CRS.from_user_input(crs)
+ elif epsg is not None:
+ crs = CRS.from_epsg(epsg)
+ else:
+ raise ValueError("Must pass either crs or epsg.")
+
+ # skip if the input CRS and output CRS are the exact same
+ if self.crs.is_exact_same(crs):
+ return self
+
+ transformer = TransformerFromCRS(self.crs, crs, always_xy=True)
+
+ new_data = transform(self._data, transformer.transform)
+ return GeometryArray(new_data, crs=crs)
@requires_pyproj
- def estimate_utm_crs(self, datum_name='WGS 84'):
+ def estimate_utm_crs(self, datum_name="WGS 84"):
"""Returns the estimated UTM CRS based on the bounds of the dataset.
.. versionadded:: 0.9
@@ -430,22 +1058,168 @@ class GeometryArray(ExtensionArray):
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
"""
- pass
+ from pyproj import CRS
+ from pyproj.aoi import AreaOfInterest
+ from pyproj.database import query_utm_crs_info
+
+ if not self.crs:
+ raise RuntimeError("crs must be set to estimate UTM CRS.")
+
+ minx, miny, maxx, maxy = self.total_bounds
+ if self.crs.is_geographic:
+ x_center = np.mean([minx, maxx])
+ y_center = np.mean([miny, maxy])
+ # ensure using geographic coordinates
+ else:
+ transformer = TransformerFromCRS(self.crs, "EPSG:4326", always_xy=True)
+ minx, miny, maxx, maxy = transformer.transform_bounds(
+ minx, miny, maxx, maxy
+ )
+ y_center = np.mean([miny, maxy])
+ # crossed the antimeridian
+ if minx > maxx:
+ # shift maxx from [-180,180] to [0,360]
+ # so both numbers are positive for center calculation
+ # Example: -175 to 185
+ maxx += 360
+ x_center = np.mean([minx, maxx])
+ # shift back to [-180,180]
+ x_center = ((x_center + 180) % 360) - 180
+ else:
+ x_center = np.mean([minx, maxx])
+
+ utm_crs_list = query_utm_crs_info(
+ datum_name=datum_name,
+ area_of_interest=AreaOfInterest(
+ west_lon_degree=x_center,
+ south_lat_degree=y_center,
+ east_lon_degree=x_center,
+ north_lat_degree=y_center,
+ ),
+ )
+ try:
+ return CRS.from_epsg(utm_crs_list[0].code)
+ except IndexError:
+ raise RuntimeError("Unable to determine UTM CRS")
+
+ #
+ # Coordinate related properties
+ #
@property
def x(self):
"""Return the x location of point geometries in a GeoSeries"""
- pass
+ if (self.geom_type[~self.isna()] == "Point").all():
+ empty = self.is_empty
+ if empty.any():
+ nonempty = ~empty
+ coords = np.full_like(nonempty, dtype=float, fill_value=np.nan)
+ coords[nonempty] = shapely.get_x(self._data[nonempty])
+ return coords
+ else:
+ return shapely.get_x(self._data)
+ else:
+ message = "x attribute access only provided for Point geometries"
+ raise ValueError(message)
@property
def y(self):
"""Return the y location of point geometries in a GeoSeries"""
- pass
+ if (self.geom_type[~self.isna()] == "Point").all():
+ empty = self.is_empty
+ if empty.any():
+ nonempty = ~empty
+ coords = np.full_like(nonempty, dtype=float, fill_value=np.nan)
+ coords[nonempty] = shapely.get_y(self._data[nonempty])
+ return coords
+ else:
+ return shapely.get_y(self._data)
+ else:
+ message = "y attribute access only provided for Point geometries"
+ raise ValueError(message)
@property
def z(self):
"""Return the z location of point geometries in a GeoSeries"""
- pass
+ if (self.geom_type[~self.isna()] == "Point").all():
+ empty = self.is_empty
+ if empty.any():
+ nonempty = ~empty
+ coords = np.full_like(nonempty, dtype=float, fill_value=np.nan)
+ coords[nonempty] = shapely.get_z(self._data[nonempty])
+ return coords
+ else:
+ return shapely.get_z(self._data)
+ else:
+ message = "z attribute access only provided for Point geometries"
+ raise ValueError(message)
+
+ @property
+ def bounds(self):
+ return shapely.bounds(self._data)
+
+ @property
+ def total_bounds(self):
+ if len(self) == 0:
+ # numpy 'min' cannot handle empty arrays
+ # TODO with numpy >= 1.15, the 'initial' argument can be used
+ return np.array([np.nan, np.nan, np.nan, np.nan])
+ b = self.bounds
+ with warnings.catch_warnings():
+ # if all rows are empty geometry / none, nan is expected
+ warnings.filterwarnings(
+ "ignore", r"All-NaN slice encountered", RuntimeWarning
+ )
+ return np.array(
+ (
+ np.nanmin(b[:, 0]), # minx
+ np.nanmin(b[:, 1]), # miny
+ np.nanmax(b[:, 2]), # maxx
+ np.nanmax(b[:, 3]), # maxy
+ )
+ )
+
+ # -------------------------------------------------------------------------
+ # general array like compat
+ # -------------------------------------------------------------------------
+
+ @property
+ def size(self):
+ return self._data.size
+
+ @property
+ def shape(self):
+ return (self.size,)
+
+ @property
+ def ndim(self):
+ return len(self.shape)
+
+ def copy(self, *args, **kwargs):
+ # still taking args/kwargs for compat with pandas 0.24
+ return GeometryArray(self._data.copy(), crs=self._crs)
+
+ def take(self, indices, allow_fill=False, fill_value=None):
+ from pandas.api.extensions import take
+
+ if allow_fill:
+ if fill_value is None or pd.isna(fill_value):
+ fill_value = None
+ elif not _is_scalar_geometry(fill_value):
+ raise TypeError("provide geometry or None as fill value")
+
+ result = take(self._data, indices, allow_fill=allow_fill, fill_value=fill_value)
+ if allow_fill and fill_value is None:
+ result[~shapely.is_valid_input(result)] = None
+ return GeometryArray(result, crs=self.crs)
+
+ # compat for pandas < 3.0
+ def _pad_or_backfill(
+ self, method, limit=None, limit_area=None, copy=True, **kwargs
+ ):
+ return super()._pad_or_backfill(
+ method=method, limit=limit, limit_area=limit_area, copy=copy, **kwargs
+ )
def fillna(self, value=None, method=None, limit=None, copy=True):
"""
@@ -474,7 +1248,38 @@ class GeometryArray(ExtensionArray):
-------
GeometryArray
"""
- pass
+ if method is not None:
+ raise NotImplementedError("fillna with a method is not yet supported")
+
+ mask = self.isna()
+ if copy:
+ new_values = self.copy()
+ else:
+ new_values = self
+
+ if not mask.any():
+ return new_values
+
+ if limit is not None and limit < len(self):
+ modify = mask.cumsum() > limit
+ if modify.any():
+ mask[modify] = False
+
+ if isna(value):
+ value = [None]
+ elif _is_scalar_geometry(value):
+ value = [value]
+ elif isinstance(value, GeometryArray):
+ value = value[mask]
+ else:
+ raise TypeError(
+ "'value' parameter must be None, a scalar geometry, or a GeoSeries, "
+ f"but you passed a {type(value).__name__!r}"
+ )
+ value_arr = np.asarray(value, dtype=object)
+
+ new_values._data[mask] = value_arr
+ return new_values
def astype(self, dtype, copy=True):
"""
@@ -494,15 +1299,39 @@ class GeometryArray(ExtensionArray):
array : ndarray
NumPy ndarray with 'dtype' for its dtype.
"""
- pass
+ if isinstance(dtype, GeometryDtype):
+ if copy:
+ return self.copy()
+ else:
+ return self
+ elif pd.api.types.is_string_dtype(dtype) and not pd.api.types.is_object_dtype(
+ dtype
+ ):
+ string_values = to_wkt(self)
+ pd_dtype = pd.api.types.pandas_dtype(dtype)
+ if isinstance(pd_dtype, pd.StringDtype):
+ # ensure to return a pandas string array instead of numpy array
+ return pd.array(string_values, dtype=pd_dtype)
+ return string_values.astype(dtype, copy=False)
+ else:
+ # numpy 2.0 makes copy=False case strict (errors if cannot avoid the copy)
+ # -> in that case use `np.asarray` as backwards compatible alternative
+ # for `copy=None` (when requiring numpy 2+, this can be cleaned up)
+ if not copy:
+ return np.asarray(self, dtype=dtype)
+ else:
+ return np.array(self, dtype=dtype, copy=copy)
def isna(self):
"""
Boolean NumPy array indicating if each value is missing
"""
- pass
+ return shapely.is_missing(self._data)
- def value_counts(self, dropna: bool=True):
+ def value_counts(
+ self,
+ dropna: bool = True,
+ ):
"""
Compute a histogram of the counts of non-null values.
@@ -515,7 +1344,21 @@ class GeometryArray(ExtensionArray):
-------
pd.Series
"""
- pass
+
+ # note ExtensionArray usage of value_counts only specifies dropna,
+ # so sort, normalize and bins are not arguments
+ values = to_wkb(self)
+ from pandas import Index, Series
+
+ result = Series(values).value_counts(dropna=dropna)
+ # value_counts converts None to nan, need to convert back for from_wkb to work
+ # note result.index already has object dtype, not geometry
+ # Can't use fillna(None) or Index.putmask, as this gets converted back to nan
+ # for object dtypes
+ result.index = Index(
+ from_wkb(np.where(result.index.isna(), None, result.index))
+ )
+ return result
def unique(self):
"""Compute the ExtensionArray of unique values.
@@ -524,7 +1367,14 @@ class GeometryArray(ExtensionArray):
-------
uniques : ExtensionArray
"""
- pass
+ from pandas import factorize
+
+ _, uniques = factorize(self)
+ return uniques
+
+ @property
+ def nbytes(self):
+ return self._data.nbytes
def shift(self, periods=1, fill_value=None):
"""
@@ -557,7 +1407,13 @@ class GeometryArray(ExtensionArray):
len(self) is returned, with all values filled with
``self.dtype.na_value``.
"""
- pass
+ shifted = super().shift(periods, fill_value)
+ shifted.crs = self.crs
+ return shifted
+
+ # -------------------------------------------------------------------------
+ # ExtensionArray specific
+ # -------------------------------------------------------------------------
@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
@@ -579,7 +1435,10 @@ class GeometryArray(ExtensionArray):
-------
ExtensionArray
"""
- pass
+ # GH 1413
+ if isinstance(scalars, BaseGeometry):
+ scalars = [scalars]
+ return from_shapely(scalars)
@classmethod
def _from_sequence_of_strings(cls, strings, *, dtype=None, copy=False):
@@ -601,9 +1460,11 @@ class GeometryArray(ExtensionArray):
-------
ExtensionArray
"""
- pass
+ # GH 3099
+ return from_wkt(strings)
def _values_for_factorize(self):
+ # type: () -> Tuple[np.ndarray, Any]
"""Return an array and missing value suitable for factorization.
Returns
@@ -618,7 +1479,8 @@ class GeometryArray(ExtensionArray):
`na_sentinal` and not included in `uniques`. By default,
``np.nan`` is used.
"""
- pass
+ vals = to_wkb(self)
+ return vals, None
@classmethod
def _from_factorized(cls, values, original):
@@ -637,9 +1499,10 @@ class GeometryArray(ExtensionArray):
pandas.factorize
ExtensionArray.factorize
"""
- pass
+ return from_wkb(values, crs=original.crs)
def _values_for_argsort(self):
+ # type: () -> np.ndarray
"""Return values for sorting.
Returns
@@ -652,7 +1515,56 @@ class GeometryArray(ExtensionArray):
--------
ExtensionArray.argsort
"""
- pass
+ # Note: this is used in `ExtensionArray.argsort`.
+ from geopandas.tools.hilbert_curve import _hilbert_distance
+
+ if self.size == 0:
+ # TODO _hilbert_distance fails for empty array
+ return np.array([], dtype="uint32")
+
+ mask_empty = self.is_empty
+ has_empty = mask_empty.any()
+ mask = self.isna() | mask_empty
+ if mask.any():
+ # if there are missing or empty geometries, we fill those with
+ # a dummy geometry so that the _hilbert_distance function can
+ # process those. The missing values are handled separately by
+ # pandas regardless of the values we return here (to sort
+ # first/last depending on 'na_position'), the distances for the
+            # empty geometries are substituted below with an appropriate value
+ geoms = self.copy()
+ indices = np.nonzero(~mask)[0]
+ if indices.size:
+ geom = self[indices[0]]
+ else:
+ # for all-empty/NA, just take random geometry
+ geom = shapely.geometry.Point(0, 0)
+
+ geoms[mask] = geom
+ else:
+ geoms = self
+ if has_empty:
+ # in case we have empty geometries, we need to expand the total
+ # bounds with a small percentage, so the empties can be
+ # deterministically sorted first
+ total_bounds = geoms.total_bounds
+ xoff = (total_bounds[2] - total_bounds[0]) * 0.01
+ yoff = (total_bounds[3] - total_bounds[1]) * 0.01
+ total_bounds += np.array([-xoff, -yoff, xoff, yoff])
+ else:
+ total_bounds = None
+ distances = _hilbert_distance(geoms, total_bounds=total_bounds)
+ if has_empty:
+ # empty geometries are sorted first ("smallest"), so fill in
+ # smallest possible value for uints
+ distances[mask_empty] = 0
+ return distances
+
+ def argmin(self, skipna: bool = True) -> int:
+ raise TypeError("geometries have no minimum or maximum")
+
+ def argmax(self, skipna: bool = True) -> int:
+ raise TypeError("geometries have no minimum or maximum")
def _formatter(self, boxed=False):
"""Formatting function for scalar values.
@@ -677,7 +1589,37 @@ class GeometryArray(ExtensionArray):
when ``boxed=False`` and :func:`str` is used when
``boxed=True``.
"""
- pass
+ if boxed:
+ import geopandas
+
+ precision = geopandas.options.display_precision
+ if precision is None:
+ if self.crs:
+ if self.crs.is_projected:
+ precision = 3
+ else:
+ precision = 5
+ else:
+ # fallback
+                # dummy heuristic based on the first 10 geometries that should
+ # work in most cases
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore", category=RuntimeWarning)
+ xmin, ymin, xmax, ymax = self[~self.isna()][:10].total_bounds
+ if (
+ (-180 <= xmin <= 180)
+ and (-180 <= xmax <= 180)
+ and (-90 <= ymin <= 90)
+ and (-90 <= ymax <= 90)
+ ):
+ # geographic coordinates
+ precision = 5
+ else:
+ # typically projected coordinates
+ # (in case of unit meter: mm precision)
+ precision = 3
+ return lambda geom: shapely.to_wkt(geom, rounding_precision=precision)
+ return repr
@classmethod
def _concat_same_type(cls, to_concat):
@@ -692,7 +1634,18 @@ class GeometryArray(ExtensionArray):
-------
ExtensionArray
"""
- pass
+ data = np.concatenate([ga._data for ga in to_concat])
+ return GeometryArray(data, crs=_get_common_crs(to_concat))
+
+ def _reduce(self, name, skipna=True, **kwargs):
+ # including the base class version here (that raises by default)
+ # because this was not yet defined in pandas 0.23
+ if name in ("any", "all"):
+ return getattr(to_shapely(self), name)()
+ raise TypeError(
+ f"'{type(self).__name__}' with dtype {self.dtype} "
+ f"does not support reduction '{name}'"
+ )
def __array__(self, dtype=None, copy=None):
"""
@@ -702,10 +1655,37 @@ class GeometryArray(ExtensionArray):
-------
values : numpy array
"""
- if copy and (dtype is None or dtype == np.dtype('object')):
+ if copy and (dtype is None or dtype == np.dtype("object")):
return self._data.copy()
return self._data
+ def _binop(self, other, op):
+ def convert_values(param):
+ if not _is_scalar_geometry(param) and (
+ isinstance(param, ExtensionArray) or pd.api.types.is_list_like(param)
+ ):
+ ovalues = param
+ else: # Assume its an object
+ ovalues = [param] * len(self)
+ return ovalues
+
+ if isinstance(other, (pd.Series, pd.Index, pd.DataFrame)):
+ # rely on pandas to unbox and dispatch to us
+ return NotImplemented
+
+ lvalues = self
+ rvalues = convert_values(other)
+
+ if len(lvalues) != len(rvalues):
+ raise ValueError("Lengths must match to compare")
+
+ # If the operator is not defined for the underlying objects,
+ # a TypeError should be raised
+ res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]
+
+ res = np.asarray(res, dtype=bool)
+ return res
+
def __eq__(self, other):
return self._binop(other, operator.eq)
@@ -717,9 +1697,64 @@ class GeometryArray(ExtensionArray):
Return for `item in self`.
"""
if isna(item):
- if item is self.dtype.na_value or isinstance(item, self.dtype.type
- ) or item is None:
+ if (
+ item is self.dtype.na_value
+ or isinstance(item, self.dtype.type)
+ or item is None
+ ):
return self.isna().any()
else:
return False
return (self == item).any()
+
+
+def _get_common_crs(arr_seq):
+ # mask out all None arrays with no crs (most likely auto generated by pandas
+ # from concat with missing column)
+ arr_seq = [ga for ga in arr_seq if not (ga.isna().all() and ga.crs is None)]
+ # determine unique crs without using a set, because CRS hash can be different
+ # for objects with the same CRS
+ unique_crs = []
+ for arr in arr_seq:
+ if arr.crs not in unique_crs:
+ unique_crs.append(arr.crs)
+
+ crs_not_none = [crs for crs in unique_crs if crs is not None]
+ names = [crs.name for crs in crs_not_none]
+
+ if len(crs_not_none) == 0:
+ return None
+ if len(crs_not_none) == 1:
+ if len(unique_crs) != 1:
+ warnings.warn(
+ "CRS not set for some of the concatenation inputs. "
+ f"Setting output's CRS as {names[0]} "
+ "(the single non-null crs provided).",
+ stacklevel=2,
+ )
+ return crs_not_none[0]
+
+ raise ValueError(
+ f"Cannot determine common CRS for concatenation inputs, got {names}. "
+ "Use `to_crs()` to transform geometries to the same CRS before merging."
+ )
+
+
+def transform(data, func):
+ has_z = shapely.has_z(data)
+
+ result = np.empty_like(data)
+
+ coords = shapely.get_coordinates(data[~has_z], include_z=False)
+ new_coords_z = func(coords[:, 0], coords[:, 1])
+ result[~has_z] = shapely.set_coordinates(
+ data[~has_z].copy(), np.array(new_coords_z).T
+ )
+
+ coords_z = shapely.get_coordinates(data[has_z], include_z=True)
+ new_coords_z = func(coords_z[:, 0], coords_z[:, 1], coords_z[:, 2])
+ result[has_z] = shapely.set_coordinates(
+ data[has_z].copy(), np.array(new_coords_z).T
+ )
+
+ return result
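For orientation, below is a minimal usage sketch of the module-level `transform` helper restored above, which is the same mechanism `GeometryArray.to_crs` relies on. It is illustrative only and not part of the patch; it assumes shapely >= 2.0 and pyproj are installed and that `transform` is imported from `geopandas.array` as defined in the hunk above.

```python
# Illustrative sketch only (not part of the patch): reproject two points with
# the module-level `transform` helper defined in geopandas/array.py above.
import numpy as np
import shapely
from pyproj import Transformer

from geopandas.array import transform

# plain object-dtype array of shapely geometries, as GeometryArray stores them
data = np.array([shapely.Point(4.9, 52.37), shapely.Point(-0.13, 51.51)], dtype=object)

# `transform` applies the callable to the coordinate arrays (x, y[, z]) and
# rebuilds each geometry via shapely.set_coordinates, handling 2D and 3D
# geometries separately -- the same pattern to_crs() uses with a pyproj Transformer.
transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)
reprojected = transform(data, transformer.transform)
print(shapely.to_wkt(reprojected[0], rounding_precision=0))
```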
diff --git a/geopandas/base.py b/geopandas/base.py
index a0f746e2..5e2729e2 100644
--- a/geopandas/base.py
+++ b/geopandas/base.py
@@ -1,11 +1,14 @@
import warnings
from warnings import warn
+
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
+
import shapely
from shapely.geometry import MultiPoint, box
from shapely.geometry.base import BaseGeometry
+
from . import _compat as compat
from .array import GeometryArray, GeometryDtype, points_from_xy
@@ -16,26 +19,108 @@ def is_geometry_type(data):
Does not include object array of shapely scalars.
"""
- pass
+ if isinstance(getattr(data, "dtype", None), GeometryDtype):
+ # GeometryArray, GeoSeries and Series[GeometryArray]
+ return True
+ else:
+ return False
+
+
+def _delegate_binary_method(op, this, other, align, *args, **kwargs):
+ # type: (str, GeoSeries, GeoSeries) -> GeoSeries/Series
+ if align is None:
+ align = True
+ maybe_warn = True
+ else:
+ maybe_warn = False
+ this = this.geometry
+ if isinstance(other, GeoPandasBase):
+ if align and not this.index.equals(other.index):
+ if maybe_warn:
+ warn(
+ "The indices of the left and right GeoSeries' are not equal, and "
+ "therefore they will be aligned (reordering and/or introducing "
+ "missing values) before executing the operation. If this alignment "
+ "is the desired behaviour, you can silence this warning by passing "
+ "'align=True'. If you don't want alignment and protect yourself of "
+ "accidentally aligning, you can pass 'align=False'.",
+ stacklevel=4,
+ )
+ this, other = this.align(other.geometry)
+ else:
+ other = other.geometry
+
+ a_this = GeometryArray(this.values)
+ other = GeometryArray(other.values)
+ elif isinstance(other, BaseGeometry):
+ a_this = GeometryArray(this.values)
+ else:
+ raise TypeError(type(this), type(other))
+
+ data = getattr(a_this, op)(other, *args, **kwargs)
+ return data, this.index
def _binary_geo(op, this, other, align, *args, **kwargs):
+ # type: (str, GeoSeries, GeoSeries) -> GeoSeries
"""Binary operation on GeoSeries objects that returns a GeoSeries"""
- pass
+ from .geoseries import GeoSeries
+
+ geoms, index = _delegate_binary_method(op, this, other, align, *args, **kwargs)
+ return GeoSeries(geoms, index=index, crs=this.crs)
def _binary_op(op, this, other, align, *args, **kwargs):
+ # type: (str, GeoSeries, GeoSeries, args/kwargs) -> Series[bool/float]
"""Binary operation on GeoSeries objects that returns a Series"""
- pass
+ data, index = _delegate_binary_method(op, this, other, align, *args, **kwargs)
+ return Series(data, index=index)
+
+
+def _delegate_property(op, this):
+ # type: (str, GeoSeries) -> GeoSeries/Series
+ a_this = GeometryArray(this.geometry.values)
+ data = getattr(a_this, op)
+ if isinstance(data, GeometryArray):
+ from .geoseries import GeoSeries
+
+ return GeoSeries(data, index=this.index, crs=this.crs)
+ else:
+ return Series(data, index=this.index)
def _delegate_geo_method(op, this, **kwargs):
+ # type: (str, GeoSeries) -> GeoSeries
"""Unary operation that returns a GeoSeries"""
- pass
+ from .geodataframe import GeoDataFrame
+ from .geoseries import GeoSeries
+
+ if isinstance(this, GeoSeries):
+ klass, var_name = "GeoSeries", "gs"
+ elif isinstance(this, GeoDataFrame):
+ klass, var_name = "GeoDataFrame", "gdf"
+ else:
+ klass, var_name = this.__class__.__name__, "this"
+
+ for key, val in kwargs.items():
+ if isinstance(val, pd.Series):
+ if not val.index.equals(this.index):
+ raise ValueError(
+ f"Index of the Series passed as '{key}' does not match index of "
+ f"the {klass}. If you want both Series to be aligned, align them "
+ f"before passing them to this method as "
+ f"`{var_name}, {key} = {var_name}.align({key})`. If "
+ f"you want to ignore the index, pass the underlying array as "
+ f"'{key}' using `{key}.values`."
+ )
+ kwargs[key] = np.asarray(val)
+
+ a_this = GeometryArray(this.geometry.values)
+ data = getattr(a_this, op)(**kwargs)
+ return GeoSeries(data, index=this.index, crs=this.crs)
class GeoPandasBase(object):
-
@property
def area(self):
"""Returns a ``Series`` containing the area of each geometry in the
@@ -83,7 +168,7 @@ class GeoPandasBase(object):
Every operation in GeoPandas is planar, i.e. the potential third
dimension is not taken into account.
"""
- pass
+ return _delegate_property("area", self)
@property
def crs(self):
@@ -118,12 +203,12 @@ class GeoPandasBase(object):
GeoSeries.set_crs : assign CRS
GeoSeries.to_crs : re-project to another CRS
"""
- pass
+ return self.geometry.values.crs
@crs.setter
def crs(self, value):
"""Sets the value of the crs"""
- pass
+ self.geometry.values.crs = value
@property
def geom_type(self):
@@ -143,12 +228,12 @@ class GeoPandasBase(object):
2 LineString
dtype: object
"""
- pass
+ return _delegate_property("geom_type", self)
@property
def type(self):
"""Return the geometry type of each geometry in the GeoSeries"""
- pass
+ return self.geom_type
@property
def length(self):
@@ -161,7 +246,8 @@ class GeoPandasBase(object):
Examples
--------
- >>> from shapely.geometry import Polygon, LineString, MultiLineString, Point, GeometryCollection
+ >>> from shapely.geometry import Polygon, LineString, MultiLineString, Point, \
+GeometryCollection
>>> s = geopandas.GeoSeries(
... [
... LineString([(0, 0), (1, 1), (0, 1)]),
@@ -169,7 +255,8 @@ class GeoPandasBase(object):
... MultiLineString([((0, 0), (1, 0)), ((-1, 0), (1, 0))]),
... Polygon([(0, 0), (1, 1), (0, 1)]),
... Point(0, 1),
- ... GeometryCollection([Point(1, 0), LineString([(10, 0), (10, 5), (0, 0)])])
+ ... GeometryCollection([Point(1, 0), LineString([(10, 0), (10, 5), (0,\
+ 0)])])
... ]
... )
>>> s
@@ -204,7 +291,7 @@ class GeoPandasBase(object):
dimension is not taken into account.
"""
- pass
+ return _delegate_property("length", self)
@property
def is_valid(self):
@@ -244,7 +331,7 @@ class GeoPandasBase(object):
--------
GeoSeries.is_valid_reason : reason for invalidity
"""
- pass
+ return _delegate_property("is_valid", self)
def is_valid_reason(self):
"""Returns a ``Series`` of strings with the reason for invalidity of
@@ -284,7 +371,7 @@ class GeoPandasBase(object):
GeoSeries.is_valid : detect invalid geometries
GeoSeries.make_valid : fix invalid geometries
"""
- pass
+ return Series(self.geometry.values.is_valid_reason(), index=self.index)
@property
def is_empty(self):
@@ -316,7 +403,7 @@ class GeoPandasBase(object):
--------
GeoSeries.isna : detect missing values
"""
- pass
+ return _delegate_property("is_empty", self)
def count_coordinates(self):
"""
@@ -359,7 +446,7 @@ class GeoPandasBase(object):
GeoSeries.get_coordinates : extract coordinates as a :class:`~pandas.DataFrame`
GoSeries.count_geometries : count the number of geometries in a collection
"""
- pass
+ return Series(self.geometry.values.count_coordinates(), index=self.index)
def count_geometries(self):
"""
@@ -403,7 +490,7 @@ class GeoPandasBase(object):
GeoSeries.count_coordinates : count the number of coordinates in a geometry
GeoSeries.count_interior_rings : count the number of interior rings
"""
- pass
+ return Series(self.geometry.values.count_geometries(), index=self.index)
def count_interior_rings(self):
"""
@@ -448,7 +535,7 @@ class GeoPandasBase(object):
GeoSeries.count_coordinates : count the number of coordinates in a geometry
GeoSeries.count_geometries : count the number of geometries in a collection
"""
- pass
+ return Series(self.geometry.values.count_interior_rings(), index=self.index)
@property
def is_simple(self):
@@ -476,7 +563,7 @@ class GeoPandasBase(object):
1 True
dtype: bool
"""
- pass
+ return _delegate_property("is_simple", self)
@property
def is_ring(self):
@@ -511,7 +598,7 @@ class GeoPandasBase(object):
dtype: bool
"""
- pass
+ return _delegate_property("is_ring", self)
@property
def is_ccw(self):
@@ -552,7 +639,7 @@ class GeoPandasBase(object):
3 False
dtype: bool
"""
- pass
+ return _delegate_property("is_ccw", self)
@property
def is_closed(self):
@@ -586,7 +673,7 @@ class GeoPandasBase(object):
3 False
dtype: bool
"""
- pass
+ return _delegate_property("is_closed", self)
@property
def has_z(self):
@@ -617,7 +704,7 @@ class GeoPandasBase(object):
1 True
dtype: bool
"""
- pass
+ return _delegate_property("has_z", self)
def get_precision(self):
"""Returns a ``Series`` of the precision of each geometry.
@@ -667,7 +754,7 @@ class GeoPandasBase(object):
--------
GeoSeries.set_precision : set precision grid size
"""
- pass
+ return Series(self.geometry.values.get_precision(), index=self.index)
def get_geometry(self, index):
"""Returns the n-th geometry from a collection of geometries.
@@ -723,7 +810,11 @@ class GeoPandasBase(object):
dtype: geometry
"""
- pass
+ return _delegate_geo_method("get_geometry", self, index=index)
+
+ #
+ # Unary operations that return a GeoSeries
+ #
@property
def boundary(self):
@@ -758,7 +849,7 @@ class GeoPandasBase(object):
GeoSeries.exterior : outer boundary (without interior rings)
"""
- pass
+ return _delegate_property("boundary", self)
@property
def centroid(self):
@@ -794,7 +885,7 @@ class GeoPandasBase(object):
--------
GeoSeries.representative_point : point guaranteed to be within each geometry
"""
- pass
+ return _delegate_property("centroid", self)
def concave_hull(self, ratio=0.0, allow_holes=False):
"""Returns a ``GeoSeries`` of geometries representing the concave hull
@@ -856,7 +947,9 @@ class GeoPandasBase(object):
GeoSeries.convex_hull : convex hull geometry
"""
- pass
+ return _delegate_geo_method(
+ "concave_hull", self, ratio=ratio, allow_holes=allow_holes
+ )
@property
def convex_hull(self):
@@ -903,7 +996,7 @@ class GeoPandasBase(object):
GeoSeries.envelope : bounding rectangle geometry
"""
- pass
+ return _delegate_property("convex_hull", self)
def delaunay_triangles(self, tolerance=0.0, only_edges=False):
"""Returns a ``GeoSeries`` consisting of objects representing
@@ -994,10 +1087,18 @@ class GeoPandasBase(object):
--------
GeoSeries.voronoi_polygons : Voronoi diagram around vertices
"""
- pass
+ from .geoseries import GeoSeries
- def voronoi_polygons(self, tolerance=0.0, extend_to=None, only_edges=False
- ):
+ geometry_input = shapely.geometrycollections(self.geometry.values._data)
+
+ delaunay = shapely.delaunay_triangles(
+ geometry_input,
+ tolerance=tolerance,
+ only_edges=only_edges,
+ )
+ return GeoSeries(delaunay, crs=self.crs).explode(ignore_index=True)
+
+ def voronoi_polygons(self, tolerance=0.0, extend_to=None, only_edges=False):
"""Returns a ``GeoSeries`` consisting of objects representing
the computed Voronoi diagram around the vertices of an input geometry.
@@ -1110,7 +1211,18 @@ class GeoPandasBase(object):
--------
GeoSeries.delaunay_triangles : Delaunay triangulation around vertices
"""
- pass
+ from .geoseries import GeoSeries
+
+ geometry_input = shapely.geometrycollections(self.geometry.values._data)
+
+ voronoi = shapely.voronoi_polygons(
+ geometry_input,
+ tolerance=tolerance,
+ extend_to=extend_to,
+ only_edges=only_edges,
+ )
+
+ return GeoSeries(voronoi, crs=self.crs).explode(ignore_index=True)
@property
def envelope(self):
@@ -1151,7 +1263,7 @@ class GeoPandasBase(object):
--------
GeoSeries.convex_hull : convex hull geometry
"""
- pass
+ return _delegate_property("envelope", self)
def minimum_rotated_rectangle(self):
"""Returns a ``GeoSeries`` of the general minimum bounding rectangle
@@ -1191,7 +1303,7 @@ class GeoPandasBase(object):
--------
GeoSeries.envelope : bounding rectangle
"""
- pass
+ return _delegate_geo_method("minimum_rotated_rectangle", self)
@property
def exterior(self):
@@ -1229,7 +1341,8 @@ class GeoPandasBase(object):
GeoSeries.boundary : complete set-theoretic boundary
GeoSeries.interiors : list of inner rings of each polygon
"""
- pass
+ # TODO: return empty geometry for non-polygons
+ return _delegate_property("exterior", self)
def extract_unique_points(self):
"""Returns a ``GeoSeries`` of MultiPoints representing all
@@ -1260,10 +1373,9 @@ class GeoPandasBase(object):
GeoSeries.get_coordinates : extract coordinates as a :class:`~pandas.DataFrame`
"""
- pass
+ return _delegate_geo_method("extract_unique_points", self)
- def offset_curve(self, distance, quad_segs=8, join_style='round',
- mitre_limit=5.0):
+ def offset_curve(self, distance, quad_segs=8, join_style="round", mitre_limit=5.0):
"""Returns a ``LineString`` or ``MultiLineString`` geometry at a
distance from the object on its right or its left side.
@@ -1305,7 +1417,14 @@ class GeoPandasBase(object):
0 LINESTRING (-1 0, -1 1, -0.981 1.195, -0.924 1...
dtype: geometry
"""
- pass
+ return _delegate_geo_method(
+ "offset_curve",
+ self,
+ distance=distance,
+ quad_segs=quad_segs,
+ join_style=join_style,
+ mitre_limit=mitre_limit,
+ )
@property
def interiors(self):
@@ -1346,7 +1465,7 @@ class GeoPandasBase(object):
--------
GeoSeries.exterior : outer boundary
"""
- pass
+ return _delegate_property("interiors", self)
def remove_repeated_points(self, tolerance=0.0):
"""Returns a ``GeoSeries`` containing a copy of the input geometry
@@ -1384,9 +1503,9 @@ class GeoPandasBase(object):
1 POLYGON ((0 0, 0 0.5, 0 1, 0.5 1, 0 0))
dtype: geometry
"""
- pass
+ return _delegate_geo_method("remove_repeated_points", self, tolerance=tolerance)
- def set_precision(self, grid_size, mode='valid_output'):
+ def set_precision(self, grid_size, mode="valid_output"):
"""Returns a ``GeoSeries`` with the precision set to a precision grid size.
By default, geometries use double precision coordinates (``grid_size=0``).
@@ -1471,7 +1590,9 @@ class GeoPandasBase(object):
:meth:`~GeoSeries.make_valid` methods.
"""
- pass
+ return _delegate_geo_method(
+ "set_precision", self, grid_size=grid_size, mode=mode
+ )
def representative_point(self):
"""Returns a ``GeoSeries`` of (cheaply computed) points that are
@@ -1504,7 +1625,7 @@ class GeoPandasBase(object):
--------
GeoSeries.centroid : geometric centroid
"""
- pass
+ return _delegate_geo_method("representative_point", self)
def minimum_bounding_circle(self):
"""Returns a ``GeoSeries`` of geometries representing the minimum bounding
@@ -1537,7 +1658,7 @@ class GeoPandasBase(object):
--------
GeoSeries.convex_hull : convex hull geometry
"""
- pass
+ return _delegate_geo_method("minimum_bounding_circle", self)
def minimum_bounding_radius(self):
"""Returns a `Series` of the radii of the minimum bounding circles
@@ -1570,7 +1691,7 @@ class GeoPandasBase(object):
GeoSeries.minumum_bounding_circle : minimum bounding circle (geometry)
"""
- pass
+ return Series(self.geometry.values.minimum_bounding_radius(), index=self.index)
def minimum_clearance(self):
"""Returns a ``Series`` containing the minimum clearance distance,
@@ -1603,7 +1724,7 @@ class GeoPandasBase(object):
2 inf
dtype: float64
"""
- pass
+ return Series(self.geometry.values.minimum_clearance(), index=self.index)
def normalize(self):
"""Returns a ``GeoSeries`` of normalized
@@ -1636,7 +1757,7 @@ class GeoPandasBase(object):
2 POINT (0 0)
dtype: geometry
"""
- pass
+ return _delegate_geo_method("normalize", self)
def make_valid(self):
"""
@@ -1674,7 +1795,7 @@ class GeoPandasBase(object):
2 LINESTRING (0 0, 1 1, 1 0)
dtype: geometry
"""
- pass
+ return _delegate_geo_method("make_valid", self)
def reverse(self):
"""Returns a ``GeoSeries`` with the order of coordinates reversed.
@@ -1706,7 +1827,7 @@ class GeoPandasBase(object):
--------
GeoSeries.normalize : normalize order of coordinates
"""
- pass
+ return _delegate_geo_method("reverse", self)
def segmentize(self, max_segment_length):
"""Returns a ``GeoSeries`` with vertices added to line segments based on
@@ -1746,7 +1867,9 @@ class GeoPandasBase(object):
1 POLYGON ((0 0, 5 0, 10 0, 10 5, 10 10, 5 10, 0...
dtype: geometry
"""
- pass
+ return _delegate_geo_method(
+ "segmentize", self, max_segment_length=max_segment_length
+ )
def transform(self, transformation, include_z=False):
"""Returns a ``GeoSeries`` with the transformation function
@@ -1786,7 +1909,9 @@ class GeoPandasBase(object):
0 POINT Z (1 1 1)
dtype: geometry
"""
- pass
+ return _delegate_geo_method(
+ "transform", self, transformation=transformation, include_z=include_z
+ )
def force_2d(self):
"""Forces the dimensionality of a geometry to 2D.
@@ -1819,7 +1944,7 @@ class GeoPandasBase(object):
2 POLYGON ((0 0, 0 10, 10 10, 0 0))
dtype: geometry
"""
- pass
+ return _delegate_geo_method("force_2d", self)
def force_3d(self, z=0):
"""Forces the dimensionality of a geometry to 3D.
@@ -1882,7 +2007,7 @@ class GeoPandasBase(object):
3 POLYGON Z ((0 0 3, 0 10 3, 10 10 3, 0 0 3))
dtype: geometry
"""
- pass
+ return _delegate_geo_method("force_3d", self, z=z)
def line_merge(self, directed=False):
"""Returns (Multi)LineStrings formed by combining the lines in a
@@ -1949,7 +2074,11 @@ class GeoPandasBase(object):
4 GEOMETRYCOLLECTION EMPTY
dtype: geometry
"""
- pass
+ return _delegate_geo_method("line_merge", self, directed=directed)
+
+ #
+ # Reduction operations that return a Shapely geometry
+ #
@property
def unary_union(self):
@@ -1977,9 +2106,17 @@ class GeoPandasBase(object):
--------
GeoSeries.union_all
"""
- pass
- def union_all(self, method='unary'):
+ warn(
+ "The 'unary_union' attribute is deprecated, "
+ "use the 'union_all()' method instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+
+ return self.geometry.values.union_all()
+
+ def union_all(self, method="unary"):
"""Returns a geometry containing the union of all geometries in the
``GeoSeries``.
@@ -2012,7 +2149,7 @@ class GeoPandasBase(object):
>>> s.union_all()
<POLYGON ((0 1, 0 2, 2 2, 2 0, 1 0, 0 0, 0 1))>
"""
- pass
+ return self.geometry.values.union_all(method=method)
def intersection_all(self):
"""Returns a geometry containing the intersection of all geometries in
@@ -2038,7 +2175,11 @@ class GeoPandasBase(object):
>>> s.intersection_all()
<POLYGON ((1 1, 1 1.5, 1.5 1.5, 1.5 1, 1 1))>
"""
- pass
+ return self.geometry.values.intersection_all()
+
+ #
+ # Binary operations that return a pandas Series
+ #
def contains(self, other, align=None):
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
@@ -2154,7 +2295,7 @@ class GeoPandasBase(object):
GeoSeries.contains_properly
GeoSeries.within
"""
- pass
+ return _binary_op("contains", self, other, align)
def contains_properly(self, other, align=None):
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
@@ -2275,7 +2416,7 @@ class GeoPandasBase(object):
--------
GeoSeries.contains
"""
- pass
+ return _binary_op("contains_properly", self, other, align)
def dwithin(self, other, distance, align=None):
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
@@ -2386,7 +2527,7 @@ class GeoPandasBase(object):
--------
GeoSeries.within
"""
- pass
+ return _binary_op("dwithin", self, other, distance=distance, align=align)
def geom_equals(self, other, align=None):
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
@@ -2496,7 +2637,7 @@ class GeoPandasBase(object):
GeoSeries.geom_equals_exact
"""
- pass
+ return _binary_op("geom_equals", self, other, align)
def geom_almost_equals(self, other, decimal=6, align=None):
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` if
@@ -2564,7 +2705,16 @@ class GeoPandasBase(object):
GeoSeries.geom_equals_exact
"""
- pass
+ warnings.warn(
+ "The 'geom_almost_equals()' method is deprecated because the name is "
+ "confusing. The 'geom_equals_exact()' method should be used instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ tolerance = 0.5 * 10 ** (-decimal)
+ return _binary_op(
+ "geom_equals_exact", self, other, tolerance=tolerance, align=align
+ )
def geom_equals_exact(self, other, tolerance, align=None):
"""Return True for all geometries that equal aligned *other* to a given
@@ -2627,7 +2777,9 @@ class GeoPandasBase(object):
--------
GeoSeries.geom_equals
"""
- pass
+ return _binary_op(
+ "geom_equals_exact", self, other, tolerance=tolerance, align=align
+ )
def crosses(self, other, align=None):
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
@@ -2739,7 +2891,7 @@ class GeoPandasBase(object):
GeoSeries.intersects
"""
- pass
+ return _binary_op("crosses", self, other, align)
def disjoint(self, other, align=None):
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
@@ -2840,7 +2992,7 @@ class GeoPandasBase(object):
GeoSeries.touches
"""
- pass
+ return _binary_op("disjoint", self, other, align)
def intersects(self, other, align=None):
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
@@ -2951,7 +3103,7 @@ class GeoPandasBase(object):
GeoSeries.touches
GeoSeries.intersection
"""
- pass
+ return _binary_op("intersects", self, other, align)
def overlaps(self, other, align=None):
"""Returns True for all aligned geometries that overlap *other*, else False.
@@ -3062,7 +3214,7 @@ class GeoPandasBase(object):
GeoSeries.intersects
"""
- pass
+ return _binary_op("overlaps", self, other, align)
def touches(self, other, align=None):
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
@@ -3174,7 +3326,7 @@ class GeoPandasBase(object):
GeoSeries.intersects
"""
- pass
+ return _binary_op("touches", self, other, align)
def within(self, other, align=None):
"""Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
@@ -3288,7 +3440,7 @@ class GeoPandasBase(object):
--------
GeoSeries.contains
"""
- pass
+ return _binary_op("within", self, other, align)
def covers(self, other, align=None):
"""
@@ -3402,7 +3554,7 @@ class GeoPandasBase(object):
GeoSeries.covered_by
GeoSeries.overlaps
"""
- pass
+ return _binary_op("covers", self, other, align)
def covered_by(self, other, align=None):
"""
@@ -3516,7 +3668,7 @@ class GeoPandasBase(object):
GeoSeries.covers
GeoSeries.overlaps
"""
- pass
+ return _binary_op("covered_by", self, other, align)
def distance(self, other, align=None):
"""Returns a ``Series`` containing the distance to aligned `other`.
@@ -3612,7 +3764,7 @@ class GeoPandasBase(object):
3 1.000000
dtype: float64
"""
- pass
+ return _binary_op("distance", self, other, align)
def hausdorff_distance(self, other, align=None, densify=None):
"""Returns a ``Series`` containing the Hausdorff distance to aligned `other`.
@@ -3727,7 +3879,7 @@ class GeoPandasBase(object):
0 70.0
dtype: float64
"""
- pass
+ return _binary_op("hausdorff_distance", self, other, align, densify=densify)
def frechet_distance(self, other, align=None, densify=None):
"""Returns a ``Series`` containing the Frechet distance to aligned `other`.
@@ -3847,7 +3999,11 @@ class GeoPandasBase(object):
0 16.77051
dtype: float64
"""
- pass
+ return _binary_op("frechet_distance", self, other, align, densify=densify)
+
+ #
+ # Binary operations that return a GeoSeries
+ #
def difference(self, other, align=None):
"""Returns a ``GeoSeries`` of the points in each aligned geometry that
@@ -3958,7 +4114,7 @@ class GeoPandasBase(object):
GeoSeries.union
GeoSeries.intersection
"""
- pass
+ return _binary_geo("difference", self, other, align)
def symmetric_difference(self, other, align=None):
"""Returns a ``GeoSeries`` of the symmetric difference of points in
@@ -4073,7 +4229,7 @@ class GeoPandasBase(object):
GeoSeries.union
GeoSeries.intersection
"""
- pass
+ return _binary_geo("symmetric_difference", self, other, align)
def union(self, other, align=None):
"""Returns a ``GeoSeries`` of the union of points in each aligned geometry with
@@ -4187,7 +4343,7 @@ class GeoPandasBase(object):
GeoSeries.difference
GeoSeries.intersection
"""
- pass
+ return _binary_geo("union", self, other, align)
def intersection(self, other, align=None):
"""Returns a ``GeoSeries`` of the intersection of points in each
@@ -4300,7 +4456,7 @@ class GeoPandasBase(object):
GeoSeries.symmetric_difference
GeoSeries.union
"""
- pass
+ return _binary_geo("intersection", self, other, align)
def clip_by_rect(self, xmin, ymin, xmax, ymax):
"""Returns a ``GeoSeries`` of the portions of geometry within the given
@@ -4365,7 +4521,11 @@ class GeoPandasBase(object):
--------
GeoSeries.intersection
"""
- pass
+ from .geoseries import GeoSeries
+
+ geometry_array = GeometryArray(self.geometry.values)
+ clipped_geometry = geometry_array.clip_by_rect(xmin, ymin, xmax, ymax)
+ return GeoSeries(clipped_geometry, index=self.index, crs=self.crs)
def shortest_line(self, other, align=None):
"""
@@ -4468,7 +4628,7 @@ class GeoPandasBase(object):
4 LINESTRING (0 1, 0 1)
dtype: geometry
"""
- pass
+ return _binary_geo("shortest_line", self, other, align)
def snap(self, other, tolerance, align=None):
"""Snaps an input geometry to reference geometry's vertices.
@@ -4569,7 +4729,7 @@ class GeoPandasBase(object):
2 POLYGON ((0 0, 0 10, 8 10, 10 10, 10 0, 0 0))
dtype: geometry
"""
- pass
+ return _binary_geo("snap", self, other, align, tolerance=tolerance)
def shared_paths(self, other, align=None):
"""
@@ -4667,7 +4827,12 @@ class GeoPandasBase(object):
--------
GeoSeries.get_geometry
"""
- pass
+
+ return _binary_geo("shared_paths", self, other, align)
+
+ #
+ # Other operations
+ #
@property
def bounds(self):
@@ -4698,7 +4863,10 @@ class GeoPandasBase(object):
1 POLYGON ((0 0, 1 1, 1 0, 0 0)) 0.0 0.0 1.0 1.0
2 LINESTRING (0 1, 1 2) 0.0 1.0 1.0 2.0
"""
- pass
+ bounds = GeometryArray(self.geometry.values).bounds
+ return DataFrame(
+ bounds, columns=["minx", "miny", "maxx", "maxy"], index=self.index
+ )
@property
def total_bounds(self):
@@ -4717,7 +4885,7 @@ class GeoPandasBase(object):
>>> gdf.total_bounds
array([ 0., -1., 3., 2.])
"""
- pass
+ return GeometryArray(self.geometry.values).total_bounds
@property
def sindex(self):
@@ -4769,7 +4937,7 @@ class GeoPandasBase(object):
array([[0],
[2]])
"""
- pass
+ return self.geometry.values.sindex
@property
def has_sindex(self):
@@ -4801,10 +4969,18 @@ class GeoPandasBase(object):
`True` if the spatial index has been generated or
`False` if not.
"""
- pass
-
- def buffer(self, distance, resolution=16, cap_style='round', join_style
- ='round', mitre_limit=5.0, single_sided=False, **kwargs):
+ return self.geometry.values.has_sindex
+
+ def buffer(
+ self,
+ distance,
+ resolution=16,
+ cap_style="round",
+ join_style="round",
+ mitre_limit=5.0,
+ single_sided=False,
+ **kwargs,
+ ):
"""Returns a ``GeoSeries`` of geometries representing all points within
a given ``distance`` of each geometric object.
@@ -4869,7 +5045,17 @@ class GeoPandasBase(object):
.. plot:: _static/code/buffer.py
"""
- pass
+ return _delegate_geo_method(
+ "buffer",
+ self,
+ distance=distance,
+ resolution=resolution,
+ cap_style=cap_style,
+ join_style=join_style,
+ mitre_limit=mitre_limit,
+ single_sided=single_sided,
+ **kwargs,
+ )
def simplify(self, tolerance, preserve_topology=True):
"""Returns a ``GeoSeries`` containing a simplified representation of
@@ -4919,7 +5105,9 @@ class GeoPandasBase(object):
1 LINESTRING (0 0, 0 20)
dtype: geometry
"""
- pass
+ return _delegate_geo_method(
+ "simplify", self, tolerance=tolerance, preserve_topology=preserve_topology
+ )
def relate(self, other, align=None):
"""
@@ -5024,7 +5212,7 @@ class GeoPandasBase(object):
dtype: object
"""
- pass
+ return _binary_op("relate", self, other, align)
def relate_pattern(self, other, pattern, align=None):
"""
@@ -5136,7 +5324,7 @@ class GeoPandasBase(object):
dtype: bool
"""
- pass
+ return _binary_op("relate_pattern", self, other, pattern=pattern, align=align)
def project(self, other, normalized=False, align=None):
"""
@@ -5235,7 +5423,7 @@ class GeoPandasBase(object):
--------
GeoSeries.interpolate
"""
- pass
+ return _binary_op("project", self, other, normalized=normalized, align=align)
def interpolate(self, distance, normalized=False):
"""
@@ -5279,7 +5467,9 @@ class GeoPandasBase(object):
2 POINT (0 2)
dtype: geometry
"""
- pass
+ return _delegate_geo_method(
+ "interpolate", self, distance=distance, normalized=normalized
+ )
def affine_transform(self, matrix):
"""Return a ``GeoSeries`` with translated geometries.
@@ -5320,8 +5510,8 @@ class GeoPandasBase(object):
2 POLYGON ((8 4, 13 10, 14 12, 8 4))
dtype: geometry
- """
- pass
+ """ # (E501 link is longer than max line length)
+ return _delegate_geo_method("affine_transform", self, matrix=matrix)
def translate(self, xoff=0.0, yoff=0.0, zoff=0.0):
"""Returns a ``GeoSeries`` with translated geometries.
@@ -5358,10 +5548,10 @@ class GeoPandasBase(object):
2 POLYGON ((5 2, 6 3, 5 4, 5 2))
dtype: geometry
- """
- pass
+ """ # (E501 link is longer than max line length)
+ return _delegate_geo_method("translate", self, xoff=xoff, yoff=yoff, zoff=zoff)
- def rotate(self, angle, origin='center', use_radians=False):
+ def rotate(self, angle, origin="center", use_radians=False):
"""Returns a ``GeoSeries`` with rotated geometries.
See http://shapely.readthedocs.io/en/latest/manual.html#shapely.affinity.rotate
@@ -5409,9 +5599,11 @@ class GeoPandasBase(object):
dtype: geometry
"""
- pass
+ return _delegate_geo_method(
+ "rotate", self, angle=angle, origin=origin, use_radians=use_radians
+ )
- def scale(self, xfact=1.0, yfact=1.0, zfact=1.0, origin='center'):
+ def scale(self, xfact=1.0, yfact=1.0, zfact=1.0, origin="center"):
"""Returns a ``GeoSeries`` with scaled geometries.
The geometries can be scaled by different factors along each
@@ -5457,9 +5649,11 @@ class GeoPandasBase(object):
2 POLYGON ((6 -3, 8 0, 6 3, 6 -3))
dtype: geometry
"""
- pass
+ return _delegate_geo_method(
+ "scale", self, xfact=xfact, yfact=yfact, zfact=zfact, origin=origin
+ )
- def skew(self, xs=0.0, ys=0.0, origin='center', use_radians=False):
+ def skew(self, xs=0.0, ys=0.0, origin="center", use_radians=False):
"""Returns a ``GeoSeries`` with skewed geometries.
The geometries are sheared by angles along the x and y dimensions.
@@ -5508,7 +5702,9 @@ class GeoPandasBase(object):
2 POLYGON ((2 0.73205, 4 2.3094, 4 2.73205, 2 0....
dtype: geometry
"""
- pass
+ return _delegate_geo_method(
+ "skew", self, xs=xs, ys=ys, origin=origin, use_radians=use_radians
+ )
@property
def cx(self):
@@ -5545,10 +5741,9 @@ class GeoPandasBase(object):
dtype: geometry
"""
- pass
+ return _CoordinateIndexer(self)
- def get_coordinates(self, include_z=False, ignore_index=False,
- index_parts=False):
+ def get_coordinates(self, include_z=False, ignore_index=False, index_parts=False):
"""Gets coordinates from a :class:`GeoSeries` as a :class:`~pandas.DataFrame` of
floats.
@@ -5619,7 +5814,22 @@ class GeoPandasBase(object):
2 3.0 1.0
3 3.0 -1.0
"""
- pass
+ coords, outer_idx = shapely.get_coordinates(
+ self.geometry.values._data, include_z=include_z, return_index=True
+ )
+
+ column_names = ["x", "y"]
+ if include_z:
+ column_names.append("z")
+
+ index = _get_index_for_parts(
+ self.index,
+ outer_idx,
+ ignore_index=ignore_index,
+ index_parts=index_parts,
+ )
+
+ return pd.DataFrame(coords, index=index, columns=column_names)
def hilbert_distance(self, total_bounds=None, level=16):
"""
@@ -5647,10 +5857,15 @@ class GeoPandasBase(object):
Series
Series containing distance along the curve for geometry
"""
- pass
+ from geopandas.tools.hilbert_curve import _hilbert_distance
+
+ distances = _hilbert_distance(
+ self.geometry.values, total_bounds=total_bounds, level=level
+ )
- def sample_points(self, size, method='uniform', seed=None, rng=None, **
- kwargs):
+ return pd.Series(distances, index=self.index, name="hilbert_distance")
+
+ def sample_points(self, size, method="uniform", seed=None, rng=None, **kwargs):
"""
Sample points from each geometry.
@@ -5706,8 +5921,50 @@ class GeoPandasBase(object):
0 MULTIPOINT ((0.1045 -0.10294), (0.35249 -0.264...
1 MULTIPOINT ((3.03261 -0.43069), (3.10068 0.114...
Name: sampled_points, dtype: geometry
- """
- pass
+ """ # noqa: E501
+ from .geoseries import GeoSeries
+ from .tools._random import uniform
+
+ if seed is not None:
+ warn(
+ "The 'seed' keyword is deprecated. Use 'rng' instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ rng = seed
+
+ if method == "uniform":
+ if pd.api.types.is_list_like(size):
+ result = [uniform(geom, s, rng) for geom, s in zip(self.geometry, size)]
+ else:
+ result = self.geometry.apply(uniform, size=size, rng=rng)
+
+ else:
+ pointpats = compat.import_optional_dependency(
+ "pointpats",
+ f"For complex sampling methods, the pointpats module is required. "
+ f"Your requested method, '{method}' was not a supported option "
+ f"and the pointpats package was not able to be imported.",
+ )
+
+ if not hasattr(pointpats.random, method):
+ raise AttributeError(
+ f"pointpats.random module has no sampling method {method}."
+ f"Consult the pointpats.random module documentation for"
+ f" available random sampling methods."
+ )
+ sample_function = getattr(pointpats.random, method)
+ result = self.geometry.apply(
+ lambda x: (
+ points_from_xy(
+ *sample_function(x, size=size, **kwargs).T
+ ).union_all()
+ if not (x.is_empty or x is None or "Polygon" not in x.geom_type)
+ else MultiPoint()
+ ),
+ )
+
+ return GeoSeries(result, name="sampled_points", crs=self.crs, index=self.index)
def build_area(self, node=True):
"""Creates an areal geometry formed by the constituent linework.
@@ -5758,7 +6015,17 @@ class GeoPandasBase(object):
Name: polygons, dtype: geometry
"""
- pass
+ from .geoseries import GeoSeries
+
+ if node:
+ geometry_input = self.geometry.union_all()
+ else:
+ geometry_input = shapely.geometrycollections(self.geometry.values._data)
+
+ polygons = shapely.build_area(geometry_input)
+ return GeoSeries(polygons, crs=self.crs, name="polygons").explode(
+ ignore_index=True
+ )
def polygonize(self, node=True, full=False):
"""Creates polygons formed from the linework of a GeoSeries.
@@ -5817,7 +6084,35 @@ class GeoPandasBase(object):
>>> polygons, cuts, dangles, invalid = s.polygonize(full=True)
"""
- pass
+ from .geoseries import GeoSeries
+
+ if node:
+ geometry_input = [self.geometry.union_all()]
+ else:
+ geometry_input = self.geometry.values
+
+ if full:
+ polygons, cuts, dangles, invalid = shapely.polygonize_full(geometry_input)
+
+ cuts = GeoSeries(cuts, crs=self.crs, name="cut_edges").explode(
+ ignore_index=True
+ )
+ dangles = GeoSeries(dangles, crs=self.crs, name="dangles").explode(
+ ignore_index=True
+ )
+ invalid = GeoSeries(invalid, crs=self.crs, name="invalid_rings").explode(
+ ignore_index=True
+ )
+ polygons = GeoSeries(polygons, crs=self.crs, name="polygons").explode(
+ ignore_index=True
+ )
+
+ return (polygons, cuts, dangles, invalid)
+
+ polygons = shapely.polygonize(geometry_input)
+ return GeoSeries(polygons, crs=self.crs, name="polygons").explode(
+ ignore_index=True
+ )
def _get_index_for_parts(orig_idx, outer_idx, ignore_index, index_parts):
@@ -5839,10 +6134,44 @@ def _get_index_for_parts(orig_idx, outer_idx, ignore_index, index_parts):
pandas.Index
index or multiindex
"""
- pass
+
+ if ignore_index:
+ return None
+ else:
+ if len(outer_idx):
+ # Generate inner index as a range per value of outer_idx
+ # 1. identify the start of each run of values in outer_idx
+ # 2. count number of values per run
+ # 3. use cumulative sums to create an incremental range
+ # starting at 0 in each run
+ run_start = np.r_[True, outer_idx[:-1] != outer_idx[1:]]
+ counts = np.diff(np.r_[np.nonzero(run_start)[0], len(outer_idx)])
+ inner_index = (~run_start).cumsum(dtype=outer_idx.dtype)
+ inner_index -= np.repeat(inner_index[run_start], counts)
+
+ else:
+ inner_index = []
+
+ # extract original index values based on integer index
+ outer_index = orig_idx.take(outer_idx)
+
+ if index_parts:
+ nlevels = outer_index.nlevels
+ index_arrays = [outer_index.get_level_values(lvl) for lvl in range(nlevels)]
+ index_arrays.append(inner_index)
+
+ index = pd.MultiIndex.from_arrays(
+ index_arrays, names=list(orig_idx.names) + [None]
+ )
+
+ else:
+ index = outer_index
+
+ return index
class _CoordinateIndexer(object):
+ # see docstring GeoPandasBase.cx property above
def __init__(self, obj):
self.obj = obj
@@ -5850,17 +6179,24 @@ class _CoordinateIndexer(object):
def __getitem__(self, key):
obj = self.obj
xs, ys = key
+ # handle numeric values as x and/or y coordinate index
if type(xs) is not slice:
xs = slice(xs, xs)
if type(ys) is not slice:
ys = slice(ys, ys)
+ # don't know how to handle step; should this raise?
if xs.step is not None or ys.step is not None:
- warn('Ignoring step - full interval is used.', stacklevel=2)
- if (xs.start is None or xs.stop is None or ys.start is None or ys.
- stop is None):
+ warn(
+ "Ignoring step - full interval is used.",
+ stacklevel=2,
+ )
+ if xs.start is None or xs.stop is None or ys.start is None or ys.stop is None:
xmin, ymin, xmax, ymax = obj.total_bounds
- bbox = box(xs.start if xs.start is not None else xmin, ys.start if
- ys.start is not None else ymin, xs.stop if xs.stop is not None else
- xmax, ys.stop if ys.stop is not None else ymax)
+ bbox = box(
+ xs.start if xs.start is not None else xmin,
+ ys.start if ys.start is not None else ymin,
+ xs.stop if xs.stop is not None else xmax,
+ ys.stop if ys.stop is not None else ymax,
+ )
idx = obj.intersects(bbox)
return obj[idx]
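As a quick sanity check of the `_CoordinateIndexer` behaviour restored above (the object backing the `.cx` property), here is a hedged usage sketch. It is illustrative only and assumes geopandas and shapely are installed.

```python
# Illustrative sketch only (not part of the patch): the `.cx` indexer builds a
# bounding box from the slice limits (open ends fall back to total_bounds) and
# keeps the rows whose geometries intersect that box.
import geopandas
from shapely.geometry import Point

s = geopandas.GeoSeries([Point(0, 0), Point(1, 1), Point(3, 3)])

subset = s.cx[0.5:2, :]  # box(0.5, total ymin, 2, total ymax)
print(subset)            # keeps only POINT (1 1)
```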
diff --git a/geopandas/explore.py b/geopandas/explore.py
index dbf38bd5..652dd6dd 100644
--- a/geopandas/explore.py
+++ b/geopandas/explore.py
@@ -1,24 +1,70 @@
import warnings
from packaging.version import Version
from statistics import mean
+
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype
+
from shapely.geometry import LineString
+
import geopandas
-_MAP_KWARGS = ['location', 'prefer_canvas', 'no_touch', 'disable_3d',
- 'png_enabled', 'zoom_control', 'crs', 'zoom_start', 'left', 'top',
- 'position', 'min_zoom', 'max_zoom', 'min_lat', 'max_lat', 'min_lon',
- 'max_lon', 'max_bounds']
-
-
-def _explore(df, column=None, cmap=None, color=None, m=None, tiles=
- 'OpenStreetMap', attr=None, tooltip=True, popup=False, highlight=True,
- categorical=False, legend=True, scheme=None, k=5, vmin=None, vmax=None,
- width='100%', height='100%', categories=None, classification_kwds=None,
- control_scale=True, marker_type=None, marker_kwds={}, style_kwds={},
- highlight_kwds={}, missing_kwds={}, tooltip_kwds={}, popup_kwds={},
- legend_kwds={}, map_kwds={}, **kwargs):
+
+_MAP_KWARGS = [
+ "location",
+ "prefer_canvas",
+ "no_touch",
+ "disable_3d",
+ "png_enabled",
+ "zoom_control",
+ "crs",
+ "zoom_start",
+ "left",
+ "top",
+ "position",
+ "min_zoom",
+ "max_zoom",
+ "min_lat",
+ "max_lat",
+ "min_lon",
+ "max_lon",
+ "max_bounds",
+]
+
+
+def _explore(
+ df,
+ column=None,
+ cmap=None,
+ color=None,
+ m=None,
+ tiles="OpenStreetMap",
+ attr=None,
+ tooltip=True,
+ popup=False,
+ highlight=True,
+ categorical=False,
+ legend=True,
+ scheme=None,
+ k=5,
+ vmin=None,
+ vmax=None,
+ width="100%",
+ height="100%",
+ categories=None,
+ classification_kwds=None,
+ control_scale=True,
+ marker_type=None,
+ marker_kwds={},
+ style_kwds={},
+ highlight_kwds={},
+ missing_kwds={},
+ tooltip_kwds={},
+ popup_kwds={},
+ legend_kwds={},
+ map_kwds={},
+ **kwargs,
+):
"""Interactive map based on GeoPandas and folium/leaflet.js
Generate an interactive leaflet map based on :class:`~geopandas.GeoDataFrame`
@@ -222,12 +268,524 @@ def _explore(df, column=None, cmap=None, color=None, m=None, tiles=
>>> df.explore("Pop2012", cmap="Blues") # doctest: +SKIP
"""
- pass
+
+ def _colormap_helper(_cmap, n_resample=None, idx=None):
+ """Helper for MPL deprecation - GH#2596"""
+ if not n_resample:
+ return cm.get_cmap(_cmap)
+ else:
+ if MPL_361:
+ return cm.get_cmap(_cmap).resampled(n_resample)(idx)
+ else:
+ return cm.get_cmap(_cmap, n_resample)(idx)
+
+ try:
+ import re
+
+ import branca as bc
+ import folium
+ import matplotlib
+ import matplotlib.pyplot as plt
+ from mapclassify import classify
+ from matplotlib import colors
+
+ # isolate MPL version - GH#2596
+ MPL_361 = Version(matplotlib.__version__) >= Version("3.6.1")
+ if MPL_361:
+ from matplotlib import colormaps as cm
+ else:
+ from matplotlib import cm
+
+ except (ImportError, ModuleNotFoundError):
+ raise ImportError(
+ "The 'folium', 'matplotlib' and 'mapclassify' packages are required for "
+ "'explore()'. You can install them using "
+ "'conda install -c conda-forge folium matplotlib mapclassify' "
+ "or 'pip install folium matplotlib mapclassify'."
+ )
+
+    # xyzservices is an optional dependency
+ try:
+ import xyzservices
+
+ HAS_XYZSERVICES = True
+ except (ImportError, ModuleNotFoundError):
+ HAS_XYZSERVICES = False
+
+ gdf = df.copy()
+
+ # convert LinearRing to LineString
+ rings_mask = df.geom_type == "LinearRing"
+ if rings_mask.any():
+ gdf.geometry[rings_mask] = gdf.geometry[rings_mask].apply(
+ lambda g: LineString(g)
+ )
+ if isinstance(gdf, geopandas.GeoSeries):
+ gdf = gdf.to_frame()
+
+ if gdf.crs is None:
+ kwargs["crs"] = "Simple"
+ tiles = None
+ elif not gdf.crs.equals(4326):
+ gdf = gdf.to_crs(4326)
+
+ # Fields which are not JSON serializable are coerced to strings
+ json_not_supported_cols = gdf.columns[
+ [is_datetime64_any_dtype(gdf[c]) for c in gdf.columns]
+ ].union(gdf.columns[gdf.dtypes == "object"])
+
+ if len(json_not_supported_cols) > 0:
+ gdf = gdf.astype({c: "string" for c in json_not_supported_cols})
+
+ if not isinstance(gdf.index, pd.MultiIndex) and (
+ is_datetime64_any_dtype(gdf.index) or (gdf.index.dtype == "object")
+ ):
+ gdf.index = gdf.index.astype("string")
+
+ # create folium.Map object
+ if m is None:
+ # Get bounds to specify location and map extent
+ bounds = gdf.total_bounds
+ location = kwargs.pop("location", None)
+ if location is None and not np.isnan(bounds).all():
+ x = mean([bounds[0], bounds[2]])
+ y = mean([bounds[1], bounds[3]])
+ location = (y, x)
+ if "zoom_start" in kwargs.keys():
+ fit = False
+ else:
+ fit = True
+ else:
+ fit = False
+
+ # get a subset of kwargs to be passed to folium.Map
+ for i in _MAP_KWARGS:
+ if i in map_kwds:
+ raise ValueError(
+ f"'{i}' cannot be specified in 'map_kwds'. "
+ f"Use the '{i}={map_kwds[i]}' argument instead."
+ )
+ map_kwds = {
+ **map_kwds,
+ **{i: kwargs[i] for i in kwargs.keys() if i in _MAP_KWARGS},
+ }
+
+ if HAS_XYZSERVICES:
+ # match provider name string to xyzservices.TileProvider
+ if isinstance(tiles, str):
+ try:
+ tiles = xyzservices.providers.query_name(tiles)
+ except ValueError:
+ pass
+
+ if isinstance(tiles, xyzservices.TileProvider):
+ attr = attr if attr else tiles.html_attribution
+ if "min_zoom" not in map_kwds:
+ map_kwds["min_zoom"] = tiles.get("min_zoom", 0)
+ if "max_zoom" not in map_kwds:
+ map_kwds["max_zoom"] = tiles.get("max_zoom", 18)
+ tiles = tiles.build_url(scale_factor="{r}")
+
+ m = folium.Map(
+ location=location,
+ control_scale=control_scale,
+ tiles=tiles,
+ attr=attr,
+ width=width,
+ height=height,
+ **map_kwds,
+ )
+
+ # fit bounds to get a proper zoom level
+ if fit:
+ m.fit_bounds([[bounds[1], bounds[0]], [bounds[3], bounds[2]]])
+
+ if gdf.is_empty.all():
+ warnings.warn(
+ "The GeoSeries you are attempting to plot is "
+ "composed of empty geometries. Nothing has been displayed.",
+ UserWarning,
+ stacklevel=3,
+ )
+ return m
+
+ for map_kwd in _MAP_KWARGS:
+ kwargs.pop(map_kwd, None)
+
+ nan_idx = None
+
+ if column is not None:
+ if pd.api.types.is_list_like(column):
+ if len(column) != gdf.shape[0]:
+ raise ValueError(
+ "The GeoDataFrame and given column have different number of rows."
+ )
+ else:
+ column_name = "__plottable_column"
+ gdf[column_name] = column
+ column = column_name
+ elif isinstance(gdf[column].dtype, pd.CategoricalDtype):
+ if categories is not None:
+ raise ValueError(
+ "Cannot specify 'categories' when column has categorical dtype"
+ )
+ categorical = True
+ elif (
+ pd.api.types.is_object_dtype(gdf[column])
+ or pd.api.types.is_bool_dtype(gdf[column])
+ or pd.api.types.is_string_dtype(gdf[column])
+ or categories
+ ):
+ categorical = True
+
+ nan_idx = pd.isna(gdf[column])
+
+ if categorical:
+ cat = pd.Categorical(gdf[column][~nan_idx], categories=categories)
+ N = len(cat.categories)
+ cmap = cmap if cmap else "tab20"
+
+ # colormap exists in matplotlib
+ if cmap in plt.colormaps():
+ color = np.apply_along_axis(
+ colors.to_hex,
+ 1,
+ _colormap_helper(cmap, n_resample=N, idx=cat.codes),
+ )
+ legend_colors = np.apply_along_axis(
+ colors.to_hex, 1, _colormap_helper(cmap, n_resample=N, idx=range(N))
+ )
+
+ # colormap is matplotlib.Colormap
+ elif isinstance(cmap, colors.Colormap):
+ color = np.apply_along_axis(colors.to_hex, 1, cmap(cat.codes))
+ legend_colors = np.apply_along_axis(colors.to_hex, 1, cmap(range(N)))
+
+ # custom list of colors
+ elif pd.api.types.is_list_like(cmap):
+ if N > len(cmap):
+ cmap = cmap * (N // len(cmap) + 1)
+ color = np.take(cmap, cat.codes)
+ legend_colors = np.take(cmap, range(N))
+
+ else:
+ raise ValueError(
+ "'cmap' is invalid. For categorical plots, pass either valid "
+ "named matplotlib colormap or a list-like of colors."
+ )
+
+ elif callable(cmap):
+ # List of colors based on Branca colormaps or self-defined functions
+ color = [cmap(x) for x in df[column]]
+
+ else:
+ vmin = gdf[column].min() if vmin is None else vmin
+ vmax = gdf[column].max() if vmax is None else vmax
+
+ # get bins
+ if scheme is not None:
+ if classification_kwds is None:
+ classification_kwds = {}
+ if "k" not in classification_kwds:
+ classification_kwds["k"] = k
+
+ binning = classify(
+ np.asarray(gdf[column][~nan_idx]), scheme, **classification_kwds
+ )
+ color = np.apply_along_axis(
+ colors.to_hex,
+ 1,
+ _colormap_helper(cmap, n_resample=binning.k, idx=binning.yb),
+ )
+
+ else:
+ bins = np.linspace(vmin, vmax, 257)[1:]
+ binning = classify(
+ np.asarray(gdf[column][~nan_idx]), "UserDefined", bins=bins
+ )
+
+ color = np.apply_along_axis(
+ colors.to_hex,
+ 1,
+ _colormap_helper(cmap, n_resample=256, idx=binning.yb),
+ )
+
+ # set default style
+ if "fillOpacity" not in style_kwds:
+ style_kwds["fillOpacity"] = 0.5
+ if "weight" not in style_kwds:
+ style_kwds["weight"] = 2
+ if "style_function" in style_kwds:
+ style_kwds_function = style_kwds["style_function"]
+ if not callable(style_kwds_function):
+ raise ValueError("'style_function' has to be a callable")
+ style_kwds.pop("style_function")
+ else:
+
+ def _no_style(x):
+ return {}
+
+ style_kwds_function = _no_style
+
+ # specify color
+ if color is not None:
+ if (
+ isinstance(color, str)
+ and isinstance(gdf, geopandas.GeoDataFrame)
+ and color in gdf.columns
+ ): # use existing column
+
+ def _style_color(x):
+ base_style = {
+ "fillColor": x["properties"][color],
+ **style_kwds,
+ }
+ return {
+ **base_style,
+ **style_kwds_function(x),
+ }
+
+ style_function = _style_color
+ else: # assign new column
+ if isinstance(gdf, geopandas.GeoSeries):
+ gdf = geopandas.GeoDataFrame(geometry=gdf)
+
+ if nan_idx is not None and nan_idx.any():
+ nan_color = missing_kwds.pop("color", None)
+
+ gdf["__folium_color"] = nan_color
+ gdf.loc[~nan_idx, "__folium_color"] = color
+ else:
+ gdf["__folium_color"] = color
+
+ stroke_color = style_kwds.pop("color", None)
+ if not stroke_color:
+
+ def _style_column(x):
+ base_style = {
+ "fillColor": x["properties"]["__folium_color"],
+ "color": x["properties"]["__folium_color"],
+ **style_kwds,
+ }
+ return {
+ **base_style,
+ **style_kwds_function(x),
+ }
+
+ style_function = _style_column
+ else:
+
+ def _style_stroke(x):
+ base_style = {
+ "fillColor": x["properties"]["__folium_color"],
+ "color": stroke_color,
+ **style_kwds,
+ }
+ return {
+ **base_style,
+ **style_kwds_function(x),
+ }
+
+ style_function = _style_stroke
+ else: # use folium default
+
+ def _style_default(x):
+ return {**style_kwds, **style_kwds_function(x)}
+
+ style_function = _style_default
+
+ if highlight:
+ if "fillOpacity" not in highlight_kwds:
+ highlight_kwds["fillOpacity"] = 0.75
+
+ def _style_highlight(x):
+ return {**highlight_kwds}
+
+ highlight_function = _style_highlight
+ else:
+ highlight_function = None
+
+ # define default for points
+ if marker_type is None:
+ marker_type = "circle_marker"
+
+ marker = marker_type
+ if isinstance(marker_type, str):
+ if marker_type == "marker":
+ marker = folium.Marker(**marker_kwds)
+ elif marker_type == "circle":
+ marker = folium.Circle(**marker_kwds)
+ elif marker_type == "circle_marker":
+ marker_kwds["radius"] = marker_kwds.get("radius", 2)
+ marker_kwds["fill"] = marker_kwds.get("fill", True)
+ marker = folium.CircleMarker(**marker_kwds)
+ else:
+ raise ValueError(
+ "Only 'marker', 'circle', and 'circle_marker' are "
+ "supported as marker values"
+ )
+
+ # remove additional geometries
+ if isinstance(gdf, geopandas.GeoDataFrame):
+ non_active_geoms = [
+ name
+ for name, val in (gdf.dtypes == "geometry").items()
+ if val and name != gdf.geometry.name
+ ]
+ gdf = gdf.drop(columns=non_active_geoms)
+
+ # prepare tooltip and popup
+ if isinstance(gdf, geopandas.GeoDataFrame):
+ # add named index to the tooltip
+ if gdf.index.name is not None:
+ gdf = gdf.reset_index()
+ # specify fields to show in the tooltip
+ tooltip = _tooltip_popup("tooltip", tooltip, gdf, **tooltip_kwds)
+ popup = _tooltip_popup("popup", popup, gdf, **popup_kwds)
+ else:
+ tooltip = None
+ popup = None
+ # escape the curly braces {{}} for jinja2 templates
+ feature_collection = gdf[
+ ~(gdf.geometry.isna() | gdf.geometry.is_empty) # drop missing or empty geoms
+ ].__geo_interface__
+ for feature in feature_collection["features"]:
+ for k in feature["properties"]:
+ # escape the curly braces in values
+ if isinstance(feature["properties"][k], str):
+ feature["properties"][k] = re.sub(
+ r"\{{2,}",
+ lambda x: "{% raw %}" + x.group(0) + "{% endraw %}",
+ feature["properties"][k],
+ )
+
+ # add dataframe to map
+ folium.GeoJson(
+ feature_collection,
+ tooltip=tooltip,
+ popup=popup,
+ marker=marker,
+ style_function=style_function,
+ highlight_function=highlight_function,
+ **kwargs,
+ ).add_to(m)
+
+ if legend:
+ # NOTE: overlaps will be resolved in branca #88
+ caption = column if not column == "__plottable_column" else ""
+ caption = legend_kwds.pop("caption", caption)
+ if categorical:
+ categories = cat.categories.to_list()
+ legend_colors = legend_colors.tolist()
+
+ if nan_idx.any() and nan_color:
+ categories.append(missing_kwds.pop("label", "NaN"))
+ legend_colors.append(nan_color)
+
+ _categorical_legend(m, caption, categories, legend_colors)
+ elif column is not None:
+ cbar = legend_kwds.pop("colorbar", True)
+ colormap_kwds = {}
+ if "max_labels" in legend_kwds:
+ colormap_kwds["max_labels"] = legend_kwds.pop("max_labels")
+ if scheme:
+ cb_colors = np.apply_along_axis(
+ colors.to_hex,
+ 1,
+ _colormap_helper(cmap, n_resample=binning.k, idx=range(binning.k)),
+ )
+ if cbar:
+ if legend_kwds.pop("scale", True):
+ index = [vmin] + binning.bins.tolist()
+ else:
+ index = None
+ colorbar = bc.colormap.StepColormap(
+ cb_colors,
+ vmin=vmin,
+ vmax=vmax,
+ caption=caption,
+ index=index,
+ **colormap_kwds,
+ )
+ else:
+ fmt = legend_kwds.pop("fmt", "{:.2f}")
+ if "labels" in legend_kwds:
+ categories = legend_kwds["labels"]
+ else:
+ categories = binning.get_legend_classes(fmt)
+ show_interval = legend_kwds.pop("interval", False)
+ if not show_interval:
+ categories = [c[1:-1] for c in categories]
+
+ if nan_idx.any() and nan_color:
+ categories.append(missing_kwds.pop("label", "NaN"))
+ cb_colors = np.append(cb_colors, nan_color)
+ _categorical_legend(m, caption, categories, cb_colors)
+
+ else:
+ if isinstance(cmap, bc.colormap.ColorMap):
+ colorbar = cmap
+ else:
+ mp_cmap = _colormap_helper(cmap)
+ cb_colors = np.apply_along_axis(
+ colors.to_hex, 1, mp_cmap(range(mp_cmap.N))
+ )
+
+ # linear legend
+ if mp_cmap.N > 20:
+ colorbar = bc.colormap.LinearColormap(
+ cb_colors,
+ vmin=vmin,
+ vmax=vmax,
+ caption=caption,
+ **colormap_kwds,
+ )
+
+ # steps
+ else:
+ colorbar = bc.colormap.StepColormap(
+ cb_colors,
+ vmin=vmin,
+ vmax=vmax,
+ caption=caption,
+ **colormap_kwds,
+ )
+
+ if cbar:
+ if nan_idx.any() and nan_color:
+ _categorical_legend(
+ m, "", [missing_kwds.pop("label", "NaN")], [nan_color]
+ )
+ m.add_child(colorbar)
+
+ return m
def _tooltip_popup(type, fields, gdf, **kwds):
"""get tooltip or popup"""
- pass
+ import folium
+
+ # specify fields to show in the tooltip
+ if fields is False or fields is None or fields == 0:
+ return None
+ else:
+ if fields is True:
+ fields = gdf.columns.drop(gdf.geometry.name).to_list()
+ elif isinstance(fields, int):
+ fields = gdf.columns.drop(gdf.geometry.name).to_list()[:fields]
+ elif isinstance(fields, str):
+ fields = [fields]
+
+ for field in ["__plottable_column", "__folium_color"]:
+ if field in fields:
+ fields.remove(field)
+
+ # Cast fields to str
+ fields = list(map(str, fields))
+ if type == "tooltip":
+ return folium.GeoJsonTooltip(fields, **kwds)
+ elif type == "popup":
+ return folium.GeoJsonPopup(fields, **kwds)
def _categorical_legend(m, title, categories, colors):
@@ -251,13 +809,118 @@ def _categorical_legend(m, title, categories, colors):
colors : list-like
list of colors (in the same order as categories)
"""
- pass
+
+ # Header to Add
+ head = """
+ {% macro header(this, kwargs) %}
+ <script src="https://code.jquery.com/ui/1.12.1/jquery-ui.js"></script>
+ <script>$( function() {
+ $( ".maplegend" ).draggable({
+ start: function (event, ui) {
+ $(this).css({
+ right: "auto",
+ top: "auto",
+ bottom: "auto"
+ });
+ }
+ });
+ });
+ </script>
+ <style type='text/css'>
+ .maplegend {
+ position: absolute;
+ z-index:9999;
+ background-color: rgba(255, 255, 255, .8);
+ border-radius: 5px;
+ box-shadow: 0 0 15px rgba(0,0,0,0.2);
+ padding: 10px;
+ font: 12px/14px Arial, Helvetica, sans-serif;
+ right: 10px;
+ bottom: 20px;
+ }
+ .maplegend .legend-title {
+ text-align: left;
+ margin-bottom: 5px;
+ font-weight: bold;
+ }
+ .maplegend .legend-scale ul {
+ margin: 0;
+ margin-bottom: 0px;
+ padding: 0;
+ float: left;
+ list-style: none;
+ }
+ .maplegend .legend-scale ul li {
+ list-style: none;
+ margin-left: 0;
+ line-height: 16px;
+ margin-bottom: 2px;
+ }
+ .maplegend ul.legend-labels li span {
+ display: block;
+ float: left;
+ height: 14px;
+ width: 14px;
+ margin-right: 5px;
+ margin-left: 0;
+ border: 0px solid #ccc;
+ }
+ .maplegend .legend-source {
+ color: #777;
+ clear: both;
+ }
+ .maplegend a {
+ color: #777;
+ }
+ </style>
+ {% endmacro %}
+ """
+ import branca as bc
+
+ # Add CSS (on Header)
+ macro = bc.element.MacroElement()
+ macro._template = bc.element.Template(head)
+ m.get_root().add_child(macro)
+
+ body = f"""
+ <div id='maplegend {title}' class='maplegend'>
+ <div class='legend-title'>{title}</div>
+ <div class='legend-scale'>
+ <ul class='legend-labels'>"""
+
+ # Loop Categories
+ for label, color in zip(categories, colors):
+ body += f"""
+ <li><span style='background:{color}'></span>{label}</li>"""
+
+ body += """
+ </ul>
+ </div>
+ </div>
+ """
+
+ # Add Body
+ body = bc.element.Element(body, "legend")
+ m.get_root().html.add_child(body)
-def _explore_geoseries(s, color=None, m=None, tiles='OpenStreetMap', attr=
- None, highlight=True, width='100%', height='100%', control_scale=True,
- marker_type=None, marker_kwds={}, style_kwds={}, highlight_kwds={},
- map_kwds={}, **kwargs):
+def _explore_geoseries(
+ s,
+ color=None,
+ m=None,
+ tiles="OpenStreetMap",
+ attr=None,
+ highlight=True,
+ width="100%",
+ height="100%",
+ control_scale=True,
+ marker_type=None,
+ marker_kwds={},
+ style_kwds={},
+ highlight_kwds={},
+ map_kwds={},
+ **kwargs,
+):
"""Interactive map based on GeoPandas and folium/leaflet.js
Generate an interactive leaflet map based on :class:`~geopandas.GeoSeries`
@@ -356,4 +1019,20 @@ def _explore_geoseries(s, color=None, m=None, tiles='OpenStreetMap', attr=
folium :class:`~folium.folium.Map` instance
"""
- pass
+ return _explore(
+ s,
+ color=color,
+ m=m,
+ tiles=tiles,
+ attr=attr,
+ highlight=highlight,
+ width=width,
+ height=height,
+ control_scale=control_scale,
+ marker_type=marker_type,
+ marker_kwds=marker_kwds,
+ style_kwds=style_kwds,
+ highlight_kwds=highlight_kwds,
+ map_kwds=map_kwds,
+ **kwargs,
+ )
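Illustrative sketch (not part of the patch): _explore is the engine behind GeoDataFrame.explore() / GeoSeries.explore(); folium, matplotlib and mapclassify must be installed, and geodatasets is used here only to fetch the sample dataset referenced in the docstring.

    import geodatasets
    import geopandas

    gdf = geopandas.read_file(geodatasets.get_path("nybb"))
    m = gdf.explore(
        column="BoroName",          # categorical column -> tab20 colors and a legend
        tooltip=["BoroName"],       # fields passed to folium.GeoJsonTooltip
        style_kwds={"weight": 1},   # merged with the fillOpacity/weight defaults above
    )
    m.save("nybb.html")             # m is a folium.Map instance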
diff --git a/geopandas/geodataframe.py b/geopandas/geodataframe.py
index 9ea3e33f..339315d6 100644
--- a/geopandas/geodataframe.py
+++ b/geopandas/geodataframe.py
@@ -1,18 +1,23 @@
import json
import warnings
+
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
+
import shapely.errors
from shapely.geometry import mapping, shape
from shapely.geometry.base import BaseGeometry
+
import geopandas.io
from geopandas.array import GeometryArray, GeometryDtype, from_shapely, to_wkb, to_wkt
from geopandas.base import GeoPandasBase, is_geometry_type
from geopandas.explore import _explore
from geopandas.geoseries import GeoSeries
+
from ._compat import HAS_PYPROJ, PANDAS_GE_30
from ._decorator import doc
+
if PANDAS_GE_30:
from pandas.core.accessor import Accessor
else:
@@ -25,7 +30,12 @@ def _geodataframe_constructor_with_fallback(*args, **kwargs):
to returning a DataFrame (if a certain operation does not preserve the
geometry column)
"""
- pass
+ df = GeoDataFrame(*args, **kwargs)
+ geometry_cols_mask = df.dtypes == "geometry"
+ if len(geometry_cols_mask) == 0 or geometry_cols_mask.sum() == 0:
+ df = pd.DataFrame(df)
+
+ return df
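Illustrative sketch (not part of the patch): the fallback constructor keeps the GeoDataFrame class only while a geometry-dtype column survives the operation.

    import geopandas
    from shapely.geometry import Point

    gdf = geopandas.GeoDataFrame({"a": [1]}, geometry=[Point(0, 0)])
    type(gdf[["a", "geometry"]])        # GeoDataFrame - geometry dtype still present
    type(gdf.drop(columns="geometry"))  # plain pandas DataFrame - no geometry column left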
def _ensure_geometry(data, crs=None):
@@ -37,12 +47,32 @@ def _ensure_geometry(data, crs=None):
If the input is a GeometryDtype with a set CRS, `crs` is ignored.
"""
- pass
+ if is_geometry_type(data):
+ if isinstance(data, Series):
+ data = GeoSeries(data)
+ if data.crs is None and crs is not None:
+ # Avoids caching issues/crs sharing issues
+ data = data.copy()
+ if isinstance(data, GeometryArray):
+ data.crs = crs
+ else:
+ data.array.crs = crs
+ return data
+ else:
+ if isinstance(data, Series):
+ out = from_shapely(np.asarray(data), crs=crs)
+ return GeoSeries(out, index=data.index, name=data.name)
+ else:
+ out = from_shapely(data, crs=crs)
+ return out
crs_mismatch_error = (
- "CRS mismatch between CRS of the passed geometries and 'crs'. Use 'GeoDataFrame.set_crs(crs, allow_override=True)' to overwrite CRS or 'GeoDataFrame.to_crs(crs)' to reproject geometries. "
- )
+ "CRS mismatch between CRS of the passed geometries "
+ "and 'crs'. Use 'GeoDataFrame.set_crs(crs, "
+ "allow_override=True)' to overwrite CRS or "
+ "'GeoDataFrame.to_crs(crs)' to reproject geometries. "
+)
class GeoDataFrame(GeoPandasBase, DataFrame):
@@ -103,57 +133,135 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
--------
GeoSeries : Series object designed to store shapely geometry objects
"""
- _metadata = ['_geometry_column_name']
- _internal_names = DataFrame._internal_names + ['geometry']
+
+ _metadata = ["_geometry_column_name"]
+
+ _internal_names = DataFrame._internal_names + ["geometry"]
_internal_names_set = set(_internal_names)
+
_geometry_column_name = None
def __init__(self, data=None, *args, geometry=None, crs=None, **kwargs):
- if kwargs.get('copy') is None and isinstance(data, DataFrame
- ) and not isinstance(data, GeoDataFrame):
+ if (
+ kwargs.get("copy") is None
+ and isinstance(data, DataFrame)
+ and not isinstance(data, GeoDataFrame)
+ ):
kwargs.update(copy=True)
super().__init__(data, *args, **kwargs)
+
+ # set_geometry ensures the geometry data have the proper dtype,
+ # but is not called if `geometry=None` ('geometry' column present
+ # in the data), so therefore need to ensure it here manually
+ # but within a try/except because currently non-geometries are
+ # allowed in that case
+ # TODO do we want to raise / return normal DataFrame in this case?
+
+ # if gdf passed in and geo_col is set, we use that for geometry
if geometry is None and isinstance(data, GeoDataFrame):
self._geometry_column_name = data._geometry_column_name
if crs is not None and data.crs != crs:
raise ValueError(crs_mismatch_error)
- if (geometry is None and self.columns.nlevels == 1 and 'geometry' in
- self.columns):
- if (self.columns == 'geometry').sum() > 1:
+
+ if (
+ geometry is None
+ and self.columns.nlevels == 1
+ and "geometry" in self.columns
+ ):
+ # Check for multiple columns with name "geometry". If there are,
+ # self["geometry"] is a gdf and constructor gets recursively recalled
+ # by pandas internals trying to access this
+ if (self.columns == "geometry").sum() > 1:
raise ValueError(
- "GeoDataFrame does not support multiple columns using the geometry column name 'geometry'."
- )
+ "GeoDataFrame does not support multiple columns "
+ "using the geometry column name 'geometry'."
+ )
+
+ # only if we have actual geometry values -> call set_geometry
try:
- if hasattr(self['geometry'].values, 'crs') and self['geometry'
- ].values.crs and crs and not self['geometry'
- ].values.crs == crs:
+ if (
+ hasattr(self["geometry"].values, "crs")
+ and self["geometry"].values.crs
+ and crs
+ and not self["geometry"].values.crs == crs
+ ):
raise ValueError(crs_mismatch_error)
- self['geometry'] = _ensure_geometry(self['geometry'].values,
- crs)
+ self["geometry"] = _ensure_geometry(self["geometry"].values, crs)
except TypeError:
pass
else:
- geometry = 'geometry'
+ geometry = "geometry"
+
if geometry is not None:
- if hasattr(geometry, 'crs'
- ) and geometry.crs and crs and not geometry.crs == crs:
+ if (
+ hasattr(geometry, "crs")
+ and geometry.crs
+ and crs
+ and not geometry.crs == crs
+ ):
raise ValueError(crs_mismatch_error)
- if hasattr(geometry, 'name') and geometry.name not in ('geometry',
- None):
- geometry = geometry.rename('geometry')
+
+ if hasattr(geometry, "name") and geometry.name not in ("geometry", None):
+ # __init__ always creates geometry col named "geometry"
+ # rename as `set_geometry` respects the given series name
+ geometry = geometry.rename("geometry")
+
self.set_geometry(geometry, inplace=True, crs=crs)
+
if geometry is None and crs:
raise ValueError(
- "Assigning CRS to a GeoDataFrame without a geometry column is not supported. Supply geometry using the 'geometry=' keyword argument, or by providing a DataFrame with column name 'geometry'"
- )
+ "Assigning CRS to a GeoDataFrame without a geometry column is not "
+ "supported. Supply geometry using the 'geometry=' keyword argument, "
+ "or by providing a DataFrame with column name 'geometry'",
+ )
def __setattr__(self, attr, val):
- if attr == 'geometry':
+ # have to special case geometry b/c pandas tries to use as column...
+ if attr == "geometry":
object.__setattr__(self, attr, val)
else:
super().__setattr__(attr, val)
- geometry = property(fget=_get_geometry, fset=_set_geometry, doc=
- 'Geometry data for GeoDataFrame')
+
+ def _get_geometry(self):
+ if self._geometry_column_name not in self:
+ if self._geometry_column_name is None:
+ msg = (
+ "You are calling a geospatial method on the GeoDataFrame, "
+ "but the active geometry column to use has not been set. "
+ )
+ else:
+ msg = (
+ "You are calling a geospatial method on the GeoDataFrame, "
+ f"but the active geometry column ('{self._geometry_column_name}') "
+ "is not present. "
+ )
+ geo_cols = list(self.columns[self.dtypes == "geometry"])
+ if len(geo_cols) > 0:
+ msg += (
+ f"\nThere are columns with geometry data type ({geo_cols}), and "
+ "you can either set one as the active geometry with "
+ 'df.set_geometry("name") or access the column as a '
+ 'GeoSeries (df["name"]) and call the method directly on it.'
+ )
+ else:
+ msg += (
+ "\nThere are no existing columns with geometry data type. You can "
+ "add a geometry column as the active geometry column with "
+ "df.set_geometry. "
+ )
+
+ raise AttributeError(msg)
+ return self[self._geometry_column_name]
+
+ def _set_geometry(self, col):
+ if not pd.api.types.is_list_like(col):
+ raise ValueError("Must use a list-like to set the geometry property")
+ self._persist_old_default_geometry_colname()
+ self.set_geometry(col, inplace=True)
+
+ geometry = property(
+ fget=_get_geometry, fset=_set_geometry, doc="Geometry data for GeoDataFrame"
+ )
def set_geometry(self, col, drop=None, inplace=False, crs=None):
"""
@@ -222,7 +330,90 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
--------
GeoDataFrame.rename_geometry : rename an active geometry column
"""
- pass
+ # Most of the code here is taken from DataFrame.set_index()
+ if inplace:
+ frame = self
+ else:
+ if PANDAS_GE_30:
+ frame = self.copy(deep=False)
+ else:
+ frame = self.copy()
+
+ geo_column_name = self._geometry_column_name
+
+ if geo_column_name is None:
+ geo_column_name = "geometry"
+ if isinstance(col, (Series, list, np.ndarray, GeometryArray)):
+ if drop:
+ msg = (
+ "The `drop` keyword argument is deprecated and has no effect when "
+ "`col` is an array-like value. You should stop passing `drop` to "
+ "`set_geometry` when this is the case."
+ )
+ warnings.warn(msg, category=FutureWarning, stacklevel=2)
+ if isinstance(col, Series) and col.name is not None:
+ geo_column_name = col.name
+
+ level = col
+ elif hasattr(col, "ndim") and col.ndim > 1:
+ raise ValueError("Must pass array with one dimension only.")
+ else: # should be a colname
+ try:
+ level = frame[col]
+ except KeyError:
+ raise ValueError("Unknown column %s" % col)
+ if isinstance(level, DataFrame):
+ raise ValueError(
+ "GeoDataFrame does not support setting the geometry column where "
+ "the column name is shared by multiple columns."
+ )
+
+ given_colname_drop_msg = (
+ "The `drop` keyword argument is deprecated and in future the only "
+ "supported behaviour will match drop=False. To silence this "
+ "warning and adopt the future behaviour, stop providing "
+ "`drop` as a keyword to `set_geometry`. To replicate the "
+ "`drop=True` behaviour you should update "
+ "your code to\n`geo_col_name = gdf.active_geometry_name;"
+ " gdf.set_geometry(new_geo_col).drop("
+ "columns=geo_col_name).rename_geometry(geo_col_name)`."
+ )
+
+ if drop is False: # specifically False, not falsy i.e. None
+ # User supplied False explicitly, but arg is deprecated
+ warnings.warn(
+ given_colname_drop_msg,
+ category=FutureWarning,
+ stacklevel=2,
+ )
+ if drop:
+ del frame[col]
+ warnings.warn(
+ given_colname_drop_msg,
+ category=FutureWarning,
+ stacklevel=2,
+ )
+ else:
+ # if not dropping, set the active geometry name to the given col name
+ geo_column_name = col
+
+ if not crs:
+ crs = getattr(level, "crs", None)
+
+ # Check that we are using a listlike of geometries
+ level = _ensure_geometry(level, crs=crs)
+ # ensure_geometry only sets crs on level if it has crs==None
+ if isinstance(level, GeoSeries):
+ level.array.crs = crs
+ else:
+ level.crs = crs
+ # update _geometry_column_name prior to assignment
+ # to avoid default is None warning
+ frame._geometry_column_name = geo_column_name
+ frame[geo_column_name] = level
+
+ if not inplace:
+ return frame
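Illustrative sketch (not part of the patch): set_geometry either switches to an existing column by name or adds an array-like under its series name and makes it the active geometry.

    import geopandas
    from shapely.geometry import Point

    gdf = geopandas.GeoDataFrame({"a": [1, 2]}, geometry=[Point(0, 0), Point(1, 1)])
    cent = gdf.geometry.centroid.rename("centroid")
    gdf2 = gdf.set_geometry(cent)      # adds a 'centroid' column and makes it active
    gdf2.active_geometry_name          # 'centroid'
    gdf2.set_geometry("geometry")      # switch back to an existing column by name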
def rename_geometry(self, col, inplace=False):
"""
@@ -257,7 +448,16 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
--------
GeoDataFrame.set_geometry : set the active geometry
"""
- pass
+ geometry_col = self.geometry.name
+ if col in self.columns:
+ raise ValueError(f"Column named {col} already exists")
+ else:
+ if not inplace:
+ return self.rename(columns={geometry_col: col}).set_geometry(
+ col, inplace=inplace
+ )
+ self.rename(columns={geometry_col: col}, inplace=inplace)
+ self.set_geometry(col, inplace=inplace)
@property
def active_geometry_name(self):
@@ -277,7 +477,7 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
--------
GeoDataFrame.set_geometry : set the active geometry
"""
- pass
+ return self._geometry_column_name
@property
def crs(self):
@@ -313,28 +513,66 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
GeoDataFrame.to_crs : re-project to another CRS
"""
- pass
+ try:
+ return self.geometry.crs
+ except AttributeError:
+ raise AttributeError(
+ "The CRS attribute of a GeoDataFrame without an active "
+ "geometry column is not defined. Use GeoDataFrame.set_geometry "
+ "to set the active geometry column."
+ )
@crs.setter
def crs(self, value):
"""Sets the value of the crs"""
- pass
+ if self._geometry_column_name is None:
+ raise ValueError(
+ "Assigning CRS to a GeoDataFrame without a geometry column is not "
+ "supported. Use GeoDataFrame.set_geometry to set the active "
+ "geometry column.",
+ )
+
+ if hasattr(self.geometry.values, "crs"):
+ if self.crs is not None:
+ warnings.warn(
+ "Overriding the CRS of a GeoDataFrame that already has CRS. "
+ "This unsafe behavior will be deprecated in future versions. "
+ "Use GeoDataFrame.set_crs method instead",
+ stacklevel=2,
+ category=DeprecationWarning,
+ )
+ self.geometry.values.crs = value
+ else:
+ # column called 'geometry' without geometry
+ raise ValueError(
+ "Assigning CRS to a GeoDataFrame without an active geometry "
+ "column is not supported. Use GeoDataFrame.set_geometry to set "
+ "the active geometry column.",
+ )
def __setstate__(self, state):
+ # overriding DataFrame method for compat with older pickles (CRS handling)
crs = None
if isinstance(state, dict):
- if 'crs' in state and '_crs' not in state:
- crs = state.pop('crs', None)
+ if "crs" in state and "_crs" not in state:
+ crs = state.pop("crs", None)
else:
- crs = state.pop('_crs', None)
+ crs = state.pop("_crs", None)
if crs is not None and not HAS_PYPROJ:
raise ImportError(
- "Unpickling a GeoDataFrame with CRS requires the 'pyproj' package, but it is not installed or does not import correctly. "
- )
+ "Unpickling a GeoDataFrame with CRS requires the 'pyproj' package, "
+ "but it is not installed or does not import correctly. "
+ )
elif crs is not None:
from pyproj import CRS
+
crs = CRS.from_user_input(crs)
+
super().__setstate__(state)
+
+ # for some versions that didn't yet have CRS at array level -> crs is set
+ # at GeoDataFrame level with '_crs' (and not 'crs'), so without propagating
+ # to the GeoSeries/GeometryArray
try:
if crs is not None:
if self.geometry.values.crs is None:
@@ -365,7 +603,8 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
GeoDataFrame
"""
- pass
+ dataframe = DataFrame.from_dict(data, **kwargs)
+ return cls(dataframe, geometry=geometry, crs=crs)
@classmethod
def from_file(cls, filename, **kwargs):
@@ -393,12 +632,18 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
>>> path = geodatasets.get_path('nybb')
>>> gdf = geopandas.GeoDataFrame.from_file(path)
>>> gdf # doctest: +SKIP
- BoroCode BoroName Shape_Leng Shape_Area geometry
- 0 5 Staten Island 330470.010332 1.623820e+09 MULTIPOLYGON (((970217.022 145643.332, 970227....
- 1 4 Queens 896344.047763 3.045213e+09 MULTIPOLYGON (((1029606.077 156073.814, 102957...
- 2 3 Brooklyn 741080.523166 1.937479e+09 MULTIPOLYGON (((1021176.479 151374.797, 102100...
- 3 1 Manhattan 359299.096471 6.364715e+08 MULTIPOLYGON (((981219.056 188655.316, 980940....
- 4 2 Bronx 464392.991824 1.186925e+09 MULTIPOLYGON (((1012821.806 229228.265, 101278...
+ BoroCode BoroName Shape_Leng Shape_Area \
+ geometry
+ 0 5 Staten Island 330470.010332 1.623820e+09 MULTIPOLYGON ((\
+(970217.022 145643.332, 970227....
+ 1 4 Queens 896344.047763 3.045213e+09 MULTIPOLYGON ((\
+(1029606.077 156073.814, 102957...
+ 2 3 Brooklyn 741080.523166 1.937479e+09 MULTIPOLYGON ((\
+(1021176.479 151374.797, 102100...
+ 3 1 Manhattan 359299.096471 6.364715e+08 MULTIPOLYGON ((\
+(981219.056 188655.316, 980940....
+ 4 2 Bronx 464392.991824 1.186925e+09 MULTIPOLYGON ((\
+(1012821.806 229228.265, 101278...
The recommended method of reading files is :func:`geopandas.read_file`:
@@ -410,7 +655,7 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
GeoDataFrame.to_file : write GeoDataFrame to file
"""
- pass
+ return geopandas.io.file._read_file(filename, **kwargs)
@classmethod
def from_features(cls, features, crs=None, columns=None):
@@ -472,12 +717,46 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
1 POINT (2 1) name2
"""
- pass
+ # Handle feature collections
+ if hasattr(features, "__geo_interface__"):
+ fs = features.__geo_interface__
+ else:
+ fs = features
+
+ if isinstance(fs, dict) and fs.get("type") == "FeatureCollection":
+ features_lst = fs["features"]
+ else:
+ features_lst = features
+
+ rows = []
+ for feature in features_lst:
+ # load geometry
+ if hasattr(feature, "__geo_interface__"):
+ feature = feature.__geo_interface__
+ row = {
+ "geometry": shape(feature["geometry"]) if feature["geometry"] else None
+ }
+ # load properties
+ properties = feature["properties"]
+ if properties is None:
+ properties = {}
+ row.update(properties)
+ rows.append(row)
+ return cls(rows, columns=columns, crs=crs)
@classmethod
- def from_postgis(cls, sql, con, geom_col='geom', crs=None, index_col=
- None, coerce_float=True, parse_dates=None, params=None, chunksize=None
- ):
+ def from_postgis(
+ cls,
+ sql,
+ con,
+ geom_col="geom",
+ crs=None,
+ index_col=None,
+ coerce_float=True,
+ parse_dates=None,
+ params=None,
+ chunksize=None,
+ ):
"""
Alternate constructor to create a ``GeoDataFrame`` from a sql query
containing a geometry column in WKB representation.
@@ -534,7 +813,20 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
--------
geopandas.read_postgis : read PostGIS database to GeoDataFrame
"""
- pass
+
+ df = geopandas.io.sql._read_postgis(
+ sql,
+ con,
+ geom_col=geom_col,
+ crs=crs,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ params=params,
+ chunksize=chunksize,
+ )
+
+ return df
@classmethod
def from_arrow(cls, table, geometry=None):
@@ -568,10 +860,13 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
GeoDataFrame
"""
- pass
+ from geopandas.io._geoarrow import arrow_to_geopandas
+
+ return arrow_to_geopandas(table, geometry=geometry)
- def to_json(self, na='null', show_bbox=False, drop_id=False, to_wgs84=
- False, **kwargs):
+ def to_json(
+ self, na="null", show_bbox=False, drop_id=False, to_wgs84=False, **kwargs
+ ):
"""
Returns a GeoJSON representation of the ``GeoDataFrame`` as a string.
@@ -621,7 +916,11 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
1 name2 POINT (2 1)
>>> gdf.to_json()
- '{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {"col1": "name1"}, "geometry": {"type": "Point", "coordinates": [1.0, 2.0]}}, {"id": "1", "type": "Feature", "properties": {"col1": "name2"}, "geometry": {"type": "Point", "coordinates": [2.0, 1.0]}}], "crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:EPSG::3857"}}}'
+ '{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", \
+"properties": {"col1": "name1"}, "geometry": {"type": "Point", "coordinates": [1.0,\
+ 2.0]}}, {"id": "1", "type": "Feature", "properties": {"col1": "name2"}, "geometry"\
+: {"type": "Point", "coordinates": [2.0, 1.0]}}], "crs": {"type": "name", "properti\
+es": {"name": "urn:ogc:def:crs:EPSG::3857"}}}'
Alternatively, you can write GeoJSON to file:
@@ -632,7 +931,35 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
GeoDataFrame.to_file : write GeoDataFrame to file
"""
- pass
+ if to_wgs84:
+ if self.crs:
+ df = self.to_crs(epsg=4326)
+ else:
+ raise ValueError(
+ "CRS is not set. Cannot re-project to WGS84 (EPSG:4326)."
+ )
+ else:
+ df = self
+
+ geo = df.to_geo_dict(na=na, show_bbox=show_bbox, drop_id=drop_id)
+
+ # if the geometry is not in WGS84, include CRS in the JSON
+ if df.crs is not None and not df.crs.equals("epsg:4326"):
+ auth_crsdef = self.crs.to_authority()
+ allowed_authorities = ["EDCS", "EPSG", "OGC", "SI", "UCUM"]
+
+ if auth_crsdef is None or auth_crsdef[0] not in allowed_authorities:
+ warnings.warn(
+ "GeoDataFrame's CRS is not representable in URN OGC "
+ "format. Resulting JSON will contain no CRS information.",
+ stacklevel=2,
+ )
+ else:
+ authority, code = auth_crsdef
+ ogc_crs = f"urn:ogc:def:crs:{authority}::{code}"
+ geo["crs"] = {"type": "name", "properties": {"name": ogc_crs}}
+
+ return json.dumps(geo, **kwargs)
@property
def __geo_interface__(self):
@@ -660,11 +987,15 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
1 name2 POINT (2 1)
>>> gdf.__geo_interface__
- {'type': 'FeatureCollection', 'features': [{'id': '0', 'type': 'Feature', 'properties': {'col1': 'name1'}, 'geometry': {'type': 'Point', 'coordinates': (1.0, 2.0)}, 'bbox': (1.0, 2.0, 1.0, 2.0)}, {'id': '1', 'type': 'Feature', 'properties': {'col1': 'name2'}, 'geometry': {'type': 'Point', 'coordinates': (2.0, 1.0)}, 'bbox': (2.0, 1.0, 2.0, 1.0)}], 'bbox': (1.0, 1.0, 2.0, 2.0)}
+ {'type': 'FeatureCollection', 'features': [{'id': '0', 'type': 'Feature', \
+'properties': {'col1': 'name1'}, 'geometry': {'type': 'Point', 'coordinates': (1.0\
+, 2.0)}, 'bbox': (1.0, 2.0, 1.0, 2.0)}, {'id': '1', 'type': 'Feature', 'properties\
+': {'col1': 'name2'}, 'geometry': {'type': 'Point', 'coordinates': (2.0, 1.0)}, 'b\
+box': (2.0, 1.0, 2.0, 1.0)}], 'bbox': (1.0, 1.0, 2.0, 2.0)}
"""
- return self.to_geo_dict(na='null', show_bbox=True, drop_id=False)
+ return self.to_geo_dict(na="null", show_bbox=True, drop_id=False)
- def iterfeatures(self, na='null', show_bbox=False, drop_id=False):
+ def iterfeatures(self, na="null", show_bbox=False, drop_id=False):
"""
Returns an iterator that yields feature dictionaries that comply with
__geo_interface__
@@ -676,7 +1007,8 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
Indicates how to output missing (NaN) values in the GeoDataFrame
- null: output the missing entries as JSON null
- - drop: remove the property from the feature. This applies to each feature individually so that features may have different properties
+ - drop: remove the property from the feature. This applies to each feature \
+individually so that features may have different properties
- keep: output the missing entries as NaN
show_bbox : bool, optional
@@ -699,11 +1031,79 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
>>> feature = next(gdf.iterfeatures())
>>> feature
- {'id': '0', 'type': 'Feature', 'properties': {'col1': 'name1'}, 'geometry': {'type': 'Point', 'coordinates': (1.0, 2.0)}}
+ {'id': '0', 'type': 'Feature', 'properties': {'col1': 'name1'}, 'geometry': {\
+'type': 'Point', 'coordinates': (1.0, 2.0)}}
"""
- pass
+ if na not in ["null", "drop", "keep"]:
+ raise ValueError("Unknown na method {0}".format(na))
+
+ if self._geometry_column_name not in self:
+ raise AttributeError(
+ "No geometry data set (expected in column '%s')."
+ % self._geometry_column_name
+ )
+
+ ids = np.array(self.index, copy=False)
+ geometries = np.array(self[self._geometry_column_name], copy=False)
+
+ if not self.columns.is_unique:
+ raise ValueError("GeoDataFrame cannot contain duplicated column names.")
+
+ properties_cols = self.columns.drop(self._geometry_column_name)
+
+ if len(properties_cols) > 0:
+ # convert to object to get python scalars.
+ properties_cols = self[properties_cols]
+ properties = properties_cols.astype(object)
+ na_mask = pd.isna(properties_cols).values
+
+ if na == "null":
+ properties[na_mask] = None
+
+ for i, row in enumerate(properties.values):
+ geom = geometries[i]
+
+ if na == "drop":
+ na_mask_row = na_mask[i]
+ properties_items = {
+ k: v
+ for k, v, na in zip(properties_cols, row, na_mask_row)
+ if not na
+ }
+ else:
+ properties_items = dict(zip(properties_cols, row))
+
+ if drop_id:
+ feature = {}
+ else:
+ feature = {"id": str(ids[i])}
+
+ feature["type"] = "Feature"
+ feature["properties"] = properties_items
+ feature["geometry"] = mapping(geom) if geom else None
+
+ if show_bbox:
+ feature["bbox"] = geom.bounds if geom else None
+
+ yield feature
+
+ else:
+ for fid, geom in zip(ids, geometries):
+ if drop_id:
+ feature = {}
+ else:
+ feature = {"id": str(fid)}
+
+ feature["type"] = "Feature"
+ feature["properties"] = {}
+ feature["geometry"] = mapping(geom) if geom else None
+
+ if show_bbox:
+ feature["bbox"] = geom.bounds if geom else None
+
+ yield feature
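Illustrative sketch (not part of the patch): iterfeatures yields one __geo_interface__-style dict per row; the na keyword controls how missing properties are emitted.

    import numpy as np
    import geopandas
    from shapely.geometry import Point

    gdf = geopandas.GeoDataFrame({"col1": ["name1", np.nan]},
                                 geometry=[Point(1, 2), Point(2, 1)])
    next(gdf.iterfeatures(na="null"))["properties"]          # {'col1': 'name1'}
    [f["properties"] for f in gdf.iterfeatures(na="drop")]   # [{'col1': 'name1'}, {}]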
- def to_geo_dict(self, na='null', show_bbox=False, drop_id=False):
+ def to_geo_dict(self, na="null", show_bbox=False, drop_id=False):
"""
Returns a python feature collection representation of the GeoDataFrame
as a dictionary with a list of features based on the ``__geo_interface__``
@@ -716,7 +1116,8 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
Indicates how to output missing (NaN) values in the GeoDataFrame
- null: output the missing entries as JSON null
- - drop: remove the property from the feature. This applies to each feature individually so that features may have different properties
+ - drop: remove the property from the feature. This applies to each feature \
+individually so that features may have different properties
- keep: output the missing entries as NaN
show_bbox : bool, optional
@@ -738,14 +1139,27 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
1 name2 POINT (2 1)
>>> gdf.to_geo_dict()
- {'type': 'FeatureCollection', 'features': [{'id': '0', 'type': 'Feature', 'properties': {'col1': 'name1'}, 'geometry': {'type': 'Point', 'coordinates': (1.0, 2.0)}}, {'id': '1', 'type': 'Feature', 'properties': {'col1': 'name2'}, 'geometry': {'type': 'Point', 'coordinates': (2.0, 1.0)}}]}
+ {'type': 'FeatureCollection', 'features': [{'id': '0', 'type': 'Feature', '\
+properties': {'col1': 'name1'}, 'geometry': {'type': 'Point', 'coordinates': (1.0, \
+2.0)}}, {'id': '1', 'type': 'Feature', 'properties': {'col1': 'name2'}, 'geometry':\
+ {'type': 'Point', 'coordinates': (2.0, 1.0)}}]}
See also
--------
GeoDataFrame.to_json : return a GeoDataFrame as a GeoJSON string
"""
- pass
+ geo = {
+ "type": "FeatureCollection",
+ "features": list(
+ self.iterfeatures(na=na, show_bbox=show_bbox, drop_id=drop_id)
+ ),
+ }
+
+ if show_bbox:
+ geo["bbox"] = tuple(self.total_bounds)
+
+ return geo
def to_wkb(self, hex=False, **kwargs):
"""
@@ -765,7 +1179,14 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
DataFrame
geometry columns are encoded to WKB
"""
- pass
+
+ df = DataFrame(self.copy())
+
+ # Encode all geometry columns to WKB
+ for col in df.columns[df.dtypes == "geometry"]:
+ df[col] = to_wkb(df[col].values, hex=hex, **kwargs)
+
+ return df
def to_wkt(self, **kwargs):
"""
@@ -781,10 +1202,18 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
DataFrame
geometry columns are encoded to WKT
"""
- pass
- def to_arrow(self, *, index=None, geometry_encoding='WKB', interleaved=
- True, include_z=None):
+ df = DataFrame(self.copy())
+
+ # Encode all geometry columns to WKT
+ for col in df.columns[df.dtypes == "geometry"]:
+ df[col] = to_wkt(df[col].values, **kwargs)
+
+ return df
+
+ def to_arrow(
+ self, *, index=None, geometry_encoding="WKB", interleaved=True, include_z=None
+ ):
"""Encode a GeoDataFrame to GeoArrow format.
See https://geoarrow.org/ for details on the GeoArrow specification.
@@ -855,14 +1284,31 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
geometry: binary
----
col1: [["name1","name2"]]
- geometry: [[0101000000000000000000F03F0000000000000040,01010000000000000000000040000000000000F03F]]
+ geometry: [[0101000000000000000000F03F0000000000000040,\
+01010000000000000000000040000000000000F03F]]
"""
- pass
-
- def to_parquet(self, path, index=None, compression='snappy',
- geometry_encoding='WKB', write_covering_bbox=False, schema_version=
- None, **kwargs):
+ from geopandas.io._geoarrow import ArrowTable, geopandas_to_arrow
+
+ table, _ = geopandas_to_arrow(
+ self,
+ index=index,
+ geometry_encoding=geometry_encoding,
+ interleaved=interleaved,
+ include_z=include_z,
+ )
+ return ArrowTable(table)
+
+ def to_parquet(
+ self,
+ path,
+ index=None,
+ compression="snappy",
+ geometry_encoding="WKB",
+ write_covering_bbox=False,
+ schema_version=None,
+ **kwargs,
+ ):
"""Write a GeoDataFrame to the Parquet format.
By default, all geometry columns present are serialized to WKB format
@@ -915,10 +1361,33 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
GeoDataFrame.to_feather : write GeoDataFrame to feather
GeoDataFrame.to_file : write GeoDataFrame to file
"""
- pass
- def to_feather(self, path, index=None, compression=None, schema_version
- =None, **kwargs):
+ # Accept engine keyword for compatibility with pandas.DataFrame.to_parquet
+ # The only engine currently supported by GeoPandas is pyarrow, so no
+ # other engine should be specified.
+ engine = kwargs.pop("engine", "auto")
+ if engine not in ("auto", "pyarrow"):
+ raise ValueError(
+ "GeoPandas only supports using pyarrow as the engine for "
+ f"to_parquet: {engine!r} passed instead."
+ )
+
+ from geopandas.io.arrow import _to_parquet
+
+ _to_parquet(
+ self,
+ path,
+ compression=compression,
+ geometry_encoding=geometry_encoding,
+ index=index,
+ schema_version=schema_version,
+ write_covering_bbox=write_covering_bbox,
+ **kwargs,
+ )
+
+ def to_feather(
+ self, path, index=None, compression=None, schema_version=None, **kwargs
+ ):
"""Write a GeoDataFrame to the Feather format.
Any geometry columns present are serialized to WKB format in the file.
@@ -956,10 +1425,19 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
GeoDataFrame.to_parquet : write GeoDataFrame to parquet
GeoDataFrame.to_file : write GeoDataFrame to file
"""
- pass
- def to_file(self, filename, driver=None, schema=None, index=None, **kwargs
- ):
+ from geopandas.io.arrow import _to_feather
+
+ _to_feather(
+ self,
+ path,
+ index=index,
+ compression=compression,
+ schema_version=schema_version,
+ **kwargs,
+ )
+
+ def to_file(self, filename, driver=None, schema=None, index=None, **kwargs):
"""Write the ``GeoDataFrame`` to a file.
By default, an ESRI shapefile is written, but any OGR data source
@@ -1053,10 +1531,11 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
... ) # doctest: +SKIP
"""
- pass
+ from geopandas.io.file import _to_file
- def set_crs(self, crs=None, epsg=None, inplace=False, allow_override=False
- ):
+ _to_file(self, filename, driver, schema, index, **kwargs)
+
+ def set_crs(self, crs=None, epsg=None, inplace=False, allow_override=False):
"""
Set the Coordinate Reference System (CRS) of the ``GeoDataFrame``.
@@ -1130,7 +1609,14 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
GeoDataFrame.to_crs : re-project to another CRS
"""
- pass
+ if not inplace:
+ df = self.copy()
+ else:
+ df = self
+ df.geometry = df.geometry.set_crs(
+ crs=crs, epsg=epsg, allow_override=allow_override, inplace=True
+ )
+ return df
def to_crs(self, crs=None, epsg=None, inplace=False):
"""Transform geometries to a new coordinate reference system.
@@ -1208,9 +1694,16 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
--------
GeoDataFrame.set_crs : assign CRS without re-projection
"""
- pass
+ if inplace:
+ df = self
+ else:
+ df = self.copy()
+ geom = df.geometry.to_crs(crs=crs, epsg=epsg)
+ df.geometry = geom
+ if not inplace:
+ return df
- def estimate_utm_crs(self, datum_name='WGS 84'):
+ def estimate_utm_crs(self, datum_name="WGS 84"):
"""Returns the estimated UTM CRS based on the bounds of the dataset.
.. versionadded:: 0.9
@@ -1246,7 +1739,7 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
"""
- pass
+ return self.geometry.estimate_utm_crs(datum_name=datum_name)
def __getitem__(self, key):
"""
@@ -1255,17 +1748,23 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
return a GeoDataFrame.
"""
result = super().__getitem__(key)
- if pd.api.types.is_scalar(key) and key == '' and isinstance(self.
- columns, pd.MultiIndex) and isinstance(result, Series
- ) and not is_geometry_type(result):
+ # Custom logic to avoid waiting for pandas GH51895
+ # result is not geometry dtype for multi-indexes
+ if (
+ pd.api.types.is_scalar(key)
+ and key == ""
+ and isinstance(self.columns, pd.MultiIndex)
+ and isinstance(result, Series)
+ and not is_geometry_type(result)
+ ):
loc = self.columns.get_loc(key)
- result = self.iloc[:, loc].squeeze(axis='columns')
+ # squeeze stops multilevel columns from returning a gdf
+ result = self.iloc[:, loc].squeeze(axis="columns")
geo_col = self._geometry_column_name
- if isinstance(result, Series) and isinstance(result.dtype,
- GeometryDtype):
+ if isinstance(result, Series) and isinstance(result.dtype, GeometryDtype):
result.__class__ = GeoSeries
elif isinstance(result, DataFrame):
- if (result.dtypes == 'geometry').sum() > 0:
+ if (result.dtypes == "geometry").sum() > 0:
result.__class__ = GeoDataFrame
if geo_col in result:
result._geometry_column_name = geo_col
@@ -1276,54 +1775,182 @@ class GeoDataFrame(GeoPandasBase, DataFrame):
def _persist_old_default_geometry_colname(self):
"""Internal util to temporarily persist the default geometry column
name of 'geometry' for backwards compatibility."""
- pass
+ # self.columns check required to avoid this warning in __init__
+ if self._geometry_column_name is None and "geometry" not in self.columns:
+ msg = (
+ "You are adding a column named 'geometry' to a GeoDataFrame "
+ "constructed without an active geometry column. Currently, "
+ "this automatically sets the active geometry column to 'geometry' "
+ "but in the future that will no longer happen. Instead, either "
+ "provide geometry to the GeoDataFrame constructor "
+ "(GeoDataFrame(... geometry=GeoSeries()) or use "
+ "`set_geometry('geometry')` "
+ "to explicitly set the active geometry column."
+ )
+ warnings.warn(msg, category=FutureWarning, stacklevel=3)
+ self._geometry_column_name = "geometry"
def __setitem__(self, key, value):
"""
Overwritten to preserve CRS of GeometryArray in cases like
df['geometry'] = [geom... for geom in df.geometry]
"""
- if not pd.api.types.is_list_like(key) and (key == self.
- _geometry_column_name or key == 'geometry' and self.
- _geometry_column_name is None):
- if pd.api.types.is_scalar(value) or isinstance(value, BaseGeometry
- ):
+
+ if not pd.api.types.is_list_like(key) and (
+ key == self._geometry_column_name
+ or key == "geometry"
+ and self._geometry_column_name is None
+ ):
+ if pd.api.types.is_scalar(value) or isinstance(value, BaseGeometry):
value = [value] * self.shape[0]
try:
if self._geometry_column_name is not None:
- crs = getattr(self, 'crs', None)
- else:
+ crs = getattr(self, "crs", None)
+ else: # don't use getattr, because a col "crs" might exist
crs = None
value = _ensure_geometry(value, crs=crs)
- if key == 'geometry':
+ if key == "geometry":
self._persist_old_default_geometry_colname()
except TypeError:
- warnings.warn('Geometry column does not contain geometry.',
- stacklevel=2)
+ warnings.warn(
+ "Geometry column does not contain geometry.",
+ stacklevel=2,
+ )
super().__setitem__(key, value)
+ #
+ # Implement pandas methods
+ #
+ @doc(pd.DataFrame)
+ def copy(self, deep=True):
+ copied = super().copy(deep=deep)
+ if type(copied) is pd.DataFrame:
+ copied.__class__ = GeoDataFrame
+ copied._geometry_column_name = self._geometry_column_name
+ return copied
+
+ @doc(pd.DataFrame)
+ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwargs):
+ result = super().apply(
+ func, axis=axis, raw=raw, result_type=result_type, args=args, **kwargs
+ )
+ # Reconstruct gdf if it was lost by apply
+ if (
+ isinstance(result, DataFrame)
+ and self._geometry_column_name in result.columns
+ ):
+ # axis=1 apply will split GeometryDType to object, try and cast back
+ try:
+ result = result.set_geometry(self._geometry_column_name)
+ except TypeError:
+ pass
+ else:
+ if self.crs is not None and result.crs is None:
+ result.set_crs(self.crs, inplace=True)
+ elif isinstance(result, Series) and result.dtype == "object":
+ # Try reconstruct series GeometryDtype if lost by apply
+            # If all None and object dtype, assume a list of Nones is more likely
+            # intended than a list of null geometries.
+ if not result.isna().all():
+ try:
+ # not enough info about func to preserve CRS
+ result = _ensure_geometry(result)
+
+ except (TypeError, shapely.errors.GeometryTypeError):
+ pass
+
+ return result
+
+ @property
+ def _constructor(self):
+ return _geodataframe_constructor_with_fallback
+
+ def _constructor_from_mgr(self, mgr, axes):
+ # replicate _geodataframe_constructor_with_fallback behaviour
+ # unless safe to skip
+ if not any(isinstance(block.dtype, GeometryDtype) for block in mgr.blocks):
+ return _geodataframe_constructor_with_fallback(
+ pd.DataFrame._from_mgr(mgr, axes)
+ )
+ gdf = GeoDataFrame._from_mgr(mgr, axes)
+ # _from_mgr doesn't preserve metadata (expect __finalize__ to be called)
+ # still need to mimic __init__ behaviour with geometry=None
+ if (gdf.columns == "geometry").sum() == 1: # only if "geometry" is single col
+ gdf._geometry_column_name = "geometry"
+ return gdf
+
+ @property
+ def _constructor_sliced(self):
+ def _geodataframe_constructor_sliced(*args, **kwargs):
+ """
+ A specialized (Geo)Series constructor which can fall back to a
+ Series if a certain operation does not produce geometries:
+
+ - We only return a GeoSeries if the data is actually of geometry
+ dtype (and so we don't try to convert geometry objects such as
+ the normal GeoSeries(..) constructor does with `_ensure_geometry`).
+ - When we get here from obtaining a row or column from a
+ GeoDataFrame, the goal is to only return a GeoSeries for a
+ geometry column, and not return a GeoSeries for a row that happened
+ to come from a DataFrame with only geometry dtype columns (and
+ thus could have a geometry dtype). Therefore, we don't return a
+ GeoSeries if we are sure we are in a row selection case (by
+ checking the identity of the index)
+ """
+ srs = pd.Series(*args, **kwargs)
+ is_row_proxy = srs.index.is_(self.columns)
+ if is_geometry_type(srs) and not is_row_proxy:
+ srs = GeoSeries(srs)
+ return srs
+
+ return _geodataframe_constructor_sliced
+
+ def _constructor_sliced_from_mgr(self, mgr, axes):
+ is_row_proxy = mgr.index.is_(self.columns)
+
+ if isinstance(mgr.blocks[0].dtype, GeometryDtype) and not is_row_proxy:
+ return GeoSeries._from_mgr(mgr, axes)
+ return Series._from_mgr(mgr, axes)
+
def __finalize__(self, other, method=None, **kwargs):
"""propagate metadata from other to self"""
self = super().__finalize__(other, method=method, **kwargs)
- if method == 'merge':
+
+ # merge operation: using metadata of the left object
+ if method == "merge":
for name in self._metadata:
object.__setattr__(self, name, getattr(other.left, name, None))
- elif method == 'concat':
+ # concat operation: using metadata of the first object
+ elif method == "concat":
for name in self._metadata:
- object.__setattr__(self, name, getattr(other.objs[0], name,
- None))
+ object.__setattr__(self, name, getattr(other.objs[0], name, None))
+
if (self.columns == self._geometry_column_name).sum() > 1:
raise ValueError(
- f"""Concat operation has resulted in multiple columns using the geometry column name '{self._geometry_column_name}'.
-Please ensure this column from the first DataFrame is not repeated."""
- )
- elif method == 'unstack':
+ "Concat operation has resulted in multiple columns using "
+ f"the geometry column name '{self._geometry_column_name}'.\n"
+ "Please ensure this column from the first DataFrame is not "
+ "repeated."
+ )
+ elif method == "unstack":
+ # unstack adds multiindex columns and reshapes data.
+ # it never makes sense to retain geometry column
self._geometry_column_name = None
self._crs = None
return self
- def dissolve(self, by=None, aggfunc='first', as_index=True, level=None,
- sort=True, observed=False, dropna=True, method='unary', **kwargs):
+ def dissolve(
+ self,
+ by=None,
+ aggfunc="first",
+ as_index=True,
+ level=None,
+ sort=True,
+ observed=False,
+ dropna=True,
+ method="unary",
+ **kwargs,
+ ):
"""
Dissolve geometries within `groupby` into single observation.
This is accomplished by applying the `union_all` method
@@ -1412,10 +2039,64 @@ Please ensure this column from the first DataFrame is not repeated."""
GeoDataFrame.explode : explode multi-part geometries into single geometries
"""
- pass
- def explode(self, column=None, ignore_index=False, index_parts=False,
- **kwargs):
+ if by is None and level is None:
+ by = np.zeros(len(self), dtype="int64")
+
+ groupby_kwargs = {
+ "by": by,
+ "level": level,
+ "sort": sort,
+ "observed": observed,
+ "dropna": dropna,
+ }
+
+ # Process non-spatial component
+ data = self.drop(labels=self.geometry.name, axis=1)
+ with warnings.catch_warnings(record=True) as record:
+ aggregated_data = data.groupby(**groupby_kwargs).agg(aggfunc, **kwargs)
+ for w in record:
+ if str(w.message).startswith("The default value of numeric_only"):
+ msg = (
+ f"The default value of numeric_only in aggfunc='{aggfunc}' "
+ "within pandas.DataFrameGroupBy.agg used in dissolve is "
+ "deprecated. In pandas 2.0, numeric_only will default to False. "
+ "Either specify numeric_only as additional argument in dissolve() "
+ "or select only columns which should be valid for the function."
+ )
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ else:
+ # Only want to capture specific warning,
+ # other warnings from pandas should be passed through
+ # TODO this is not an ideal approach
+ warnings.showwarning(
+ w.message, w.category, w.filename, w.lineno, w.file, w.line
+ )
+
+ aggregated_data.columns = aggregated_data.columns.to_flat_index()
+
+ # Process spatial component
+ def merge_geometries(block):
+ merged_geom = block.union_all(method=method)
+ return merged_geom
+
+ g = self.groupby(group_keys=False, **groupby_kwargs)[self.geometry.name].agg(
+ merge_geometries
+ )
+
+ # Aggregate
+ aggregated_geometry = GeoDataFrame(g, geometry=self.geometry.name, crs=self.crs)
+ # Recombine
+ aggregated = aggregated_geometry.join(aggregated_data)
+
+ # Reset if requested
+ if not as_index:
+ aggregated = aggregated.reset_index()
+
+ return aggregated
+
+ # overrides the pandas native explode method to break up features geometrically
+ def explode(self, column=None, ignore_index=False, index_parts=False, **kwargs):
"""
Explode multi-part geometries into multiple single geometries.
@@ -1490,9 +2171,39 @@ Please ensure this column from the first DataFrame is not repeated."""
GeoDataFrame.dissolve : dissolve geometries into a single observation.
"""
- pass
- def astype(self, dtype, copy=None, errors='raise', **kwargs):
+ # If no column is specified then default to the active geometry column
+ if column is None:
+ column = self.geometry.name
+ # If the specified column is not a geometry dtype use pandas explode
+ if not isinstance(self[column].dtype, GeometryDtype):
+ return super().explode(column, ignore_index=ignore_index, **kwargs)
+
+ exploded_geom = self.geometry.reset_index(drop=True).explode(index_parts=True)
+
+ df = self.drop(self._geometry_column_name, axis=1).take(
+ exploded_geom.index.droplevel(-1)
+ )
+ df[exploded_geom.name] = exploded_geom.values
+ df = df.set_geometry(self._geometry_column_name).__finalize__(self)
+
+ if ignore_index:
+ df.reset_index(inplace=True, drop=True)
+ elif index_parts:
+ # reset to MultiIndex, otherwise df index is only first level of
+ # exploded GeoSeries index.
+ df = df.set_index(
+ exploded_geom.index.droplevel(
+ list(range(exploded_geom.index.nlevels - 1))
+ ),
+ append=True,
+ )
+
+ return df
+
+ # overrides the pandas astype method to ensure the correct return type
+ # should be removable when pandas 1.4 is dropped
+ def astype(self, dtype, copy=None, errors="raise", **kwargs):
"""
Cast a pandas object to a specified dtype ``dtype``.
Returns a GeoDataFrame when the geometry column is kept as geometries,
@@ -1502,10 +2213,34 @@ Please ensure this column from the first DataFrame is not repeated."""
-------
GeoDataFrame or DataFrame
"""
- pass
+ if not PANDAS_GE_30 and copy is None:
+ copy = True
+ if copy is not None:
+ kwargs["copy"] = copy
- def to_postgis(self, name, con, schema=None, if_exists='fail', index=
- False, index_label=None, chunksize=None, dtype=None):
+ df = super().astype(dtype, errors=errors, **kwargs)
+
+ try:
+ geoms = df[self._geometry_column_name]
+ if is_geometry_type(geoms):
+ return geopandas.GeoDataFrame(df, geometry=self._geometry_column_name)
+ except KeyError:
+ pass
+ # if the geometry column is converted to non-geometries or did not exist
+ # do not return a GeoDataFrame
+ return pd.DataFrame(df)
+
+ def to_postgis(
+ self,
+ name,
+ con,
+ schema=None,
+ if_exists="fail",
+ index=False,
+ index_label=None,
+ chunksize=None,
+ dtype=None,
+ ):
"""
Upload GeoDataFrame into PostGIS database.
@@ -1549,7 +2284,8 @@ Please ensure this column from the first DataFrame is not repeated."""
--------
>>> from sqlalchemy import create_engine
- >>> engine = create_engine("postgresql://myusername:mypassword@myhost:5432/mydatabase") # doctest: +SKIP
+ >>> engine = create_engine("postgresql://myusername:mypassword@myhost:5432\
+/mydatabase") # doctest: +SKIP
>>> gdf.to_postgis("my_table", engine) # doctest: +SKIP
See also
@@ -1558,8 +2294,15 @@ Please ensure this column from the first DataFrame is not repeated."""
read_postgis : read PostGIS database to GeoDataFrame
"""
- pass
- plot = Accessor('plot', geopandas.plotting.GeoplotAccessor)
+ geopandas.io.sql._write_postgis(
+ self, name, con, schema, if_exists, index, index_label, chunksize, dtype
+ )
+
+ plot = Accessor("plot", geopandas.plotting.GeoplotAccessor)
+
+ @doc(_explore)
+ def explore(self, *args, **kwargs):
+ return _explore(self, *args, **kwargs)
def sjoin(self, df, *args, **kwargs):
"""Spatial join of two GeoDataFrames.
@@ -1645,10 +2388,18 @@ Please ensure this column from the first DataFrame is not repeated."""
GeoDataFrame.sjoin_nearest : nearest neighbor join
sjoin : equivalent top-level function
"""
- pass
-
- def sjoin_nearest(self, right, how='inner', max_distance=None, lsuffix=
- 'left', rsuffix='right', distance_col=None, exclusive=False):
+ return geopandas.sjoin(left_df=self, right_df=df, *args, **kwargs) # noqa: B026
+
+ def sjoin_nearest(
+ self,
+ right,
+ how="inner",
+ max_distance=None,
+ lsuffix="left",
+ rsuffix="right",
+ distance_col=None,
+ exclusive=False,
+ ):
"""
Spatial join of two GeoDataFrames based on the distance between their
geometries.
@@ -1727,8 +2478,10 @@ Please ensure this column from the first DataFrame is not repeated."""
To include the distances:
- >>> groceries_w_communities = groceries.sjoin_nearest(chicago, distance_col="distances")
- >>> groceries_w_communities[["Chain", "community", "distances"]].head(2)
+ >>> groceries_w_communities = groceries.sjoin_nearest(chicago, \
+distance_col="distances")
+ >>> groceries_w_communities[["Chain", "community", \
+"distances"]].head(2)
Chain community distances
0 VIET HOA PLAZA UPTOWN 0.0
1 COUNTY FAIR FOODS MORGAN PARK 0.0
@@ -1737,8 +2490,10 @@ Please ensure this column from the first DataFrame is not repeated."""
results are equidistant (in this case zero because they intersect).
In fact, we get 4 results in total:
- >>> chicago_w_groceries = groceries.sjoin_nearest(chicago, distance_col="distances", how="right")
- >>> uptown_results = chicago_w_groceries[chicago_w_groceries["community"] == "UPTOWN"]
+ >>> chicago_w_groceries = groceries.sjoin_nearest(chicago, \
+distance_col="distances", how="right")
+ >>> uptown_results = \
+chicago_w_groceries[chicago_w_groceries["community"] == "UPTOWN"]
>>> uptown_results[["Chain", "community"]]
Chain community
30 VIET HOA PLAZA UPTOWN
@@ -1759,7 +2514,16 @@ Please ensure this column from the first DataFrame is not repeated."""
Every operation in GeoPandas is planar, i.e. the potential third
dimension is not taken into account.
"""
- pass
+ return geopandas.sjoin_nearest(
+ self,
+ right,
+ how=how,
+ max_distance=max_distance,
+ lsuffix=lsuffix,
+ rsuffix=rsuffix,
+ distance_col=distance_col,
+ exclusive=exclusive,
+ )
def clip(self, mask, keep_geom_type=False, sort=False):
"""Clip points, lines, or polygon geometries to the mask extent.
@@ -1818,10 +2582,9 @@ Please ensure this column from the first DataFrame is not repeated."""
>>> nws_groceries.shape
(7, 8)
"""
- pass
+ return geopandas.clip(self, mask=mask, keep_geom_type=keep_geom_type, sort=sort)
- def overlay(self, right, how='intersection', keep_geom_type=None,
- make_valid=True):
+ def overlay(self, right, how="intersection", keep_geom_type=None, make_valid=True):
"""Perform spatial overlay between GeoDataFrames.
Currently only supports data GeoDataFrames with uniform geometry types,
@@ -1909,7 +2672,19 @@ Please ensure this column from the first DataFrame is not repeated."""
Every operation in GeoPandas is planar, i.e. the potential third
dimension is not taken into account.
"""
- pass
+ return geopandas.overlay(
+ self, right, how=how, keep_geom_type=keep_geom_type, make_valid=make_valid
+ )
+
+
+def _dataframe_set_geometry(self, col, drop=None, inplace=False, crs=None):
+ if inplace:
+ raise ValueError(
+ "Can't do inplace setting when converting from DataFrame to GeoDataFrame"
+ )
+ gf = GeoDataFrame(self)
+ # this will copy so that BlockManager gets copied
+ return gf.set_geometry(col, drop=drop, inplace=False, crs=crs)
DataFrame.set_geometry = _dataframe_set_geometry
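
As a quick illustration of the `GeoDataFrame` behaviour restored above (dissolve via groupby plus `union_all`, geometric `explode`, and the plain-`Series` fallback for non-geometry columns), here is a minimal sketch. It is not part of the patch; the toy data is made up and shapely 2.x is assumed:

```python
import geopandas
import pandas as pd
from shapely.geometry import MultiPoint, Point

# hypothetical toy frame, only for illustration
gdf = geopandas.GeoDataFrame(
    {
        "group": ["a", "a", "b"],
        "geometry": [Point(0, 0), Point(1, 1), MultiPoint([(2, 2), (3, 3)])],
    },
    crs="EPSG:4326",
)

# dissolve: groups rows by "group" and unions the geometries of each group
dissolved = gdf.dissolve(by="group")
assert len(dissolved) == 2

# explode: splits the MultiPoint row into two single-part rows
exploded = gdf.explode(index_parts=True)
assert len(exploded) == 4

# selecting a non-geometry column falls back to a plain pandas Series
assert type(gdf["group"]) is pd.Series
```
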
diff --git a/geopandas/geoseries.py b/geopandas/geoseries.py
index 38c4ecf5..ea30f61d 100644
--- a/geopandas/geoseries.py
+++ b/geopandas/geoseries.py
@@ -1,40 +1,79 @@
from __future__ import annotations
+
import typing
import warnings
from packaging.version import Version
from typing import Any, Callable, Dict, Optional
+
import numpy as np
import pandas as pd
from pandas import Series
from pandas.core.internals import SingleBlockManager
+
import shapely
from shapely.geometry import GeometryCollection
from shapely.geometry.base import BaseGeometry
+
import geopandas
from geopandas.base import GeoPandasBase, _delegate_property
from geopandas.explore import _explore_geoseries
from geopandas.plotting import plot_series
+
from . import _compat as compat
from ._decorator import doc
-from .array import GeometryDtype, from_shapely, from_wkb, from_wkt, points_from_xy, to_wkb, to_wkt
+from .array import (
+ GeometryDtype,
+ from_shapely,
+ from_wkb,
+ from_wkt,
+ points_from_xy,
+ to_wkb,
+ to_wkt,
+)
from .base import is_geometry_type
+
if typing.TYPE_CHECKING:
import os
-def _geoseries_constructor_with_fallback(data=None, index=None, crs:
- Optional[Any]=None, **kwargs):
+def _geoseries_constructor_with_fallback(
+ data=None, index=None, crs: Optional[Any] = None, **kwargs
+):
"""
A flexible constructor for GeoSeries._constructor, which needs to be able
to fall back to a Series (if a certain operation does not produce
geometries)
"""
- pass
+ try:
+ return GeoSeries(data=data, index=index, crs=crs, **kwargs)
+ except TypeError:
+ return Series(data=data, index=index, **kwargs)
def _expanddim_logic(df):
"""Shared logic for _constructor_expanddim and _constructor_from_mgr_expanddim."""
- pass
+ from geopandas import GeoDataFrame
+
+ if (df.dtypes == "geometry").sum() > 0:
+ if df.shape[1] == 1:
+ geo_col_name = df.columns[0]
+ else:
+ geo_col_name = None
+
+ if geo_col_name is None or not is_geometry_type(df[geo_col_name]):
+ df = GeoDataFrame(df)
+ df._geometry_column_name = None
+ else:
+ df = df.set_geometry(geo_col_name)
+
+ return df
+
+
+def _geoseries_expanddim(data=None, *args, **kwargs):
+ # pd.Series._constructor_expanddim == pd.DataFrame, we start
+ # with that then specialize.
+ df = pd.DataFrame(data, *args, **kwargs)
+ return _expanddim_logic(df)
class GeoSeries(GeoPandasBase, Series):
@@ -115,60 +154,106 @@ class GeoSeries(GeoPandasBase, Series):
"""
- def __init__(self, data=None, index=None, crs: Optional[Any]=None, **kwargs
- ):
- if (hasattr(data, 'crs') or isinstance(data, pd.Series) and hasattr
- (data.array, 'crs')) and crs:
- data_crs = data.crs if hasattr(data, 'crs') else data.array.crs
+ def __init__(self, data=None, index=None, crs: Optional[Any] = None, **kwargs):
+ if (
+ hasattr(data, "crs")
+ or (isinstance(data, pd.Series) and hasattr(data.array, "crs"))
+ ) and crs:
+ data_crs = data.crs if hasattr(data, "crs") else data.array.crs
if not data_crs:
+ # make a copy to avoid setting CRS to passed GeometryArray
data = data.copy()
- elif not data.crs == crs:
- raise ValueError(
- "CRS mismatch between CRS of the passed geometries and 'crs'. Use 'GeoSeries.set_crs(crs, allow_override=True)' to overwrite CRS or 'GeoSeries.to_crs(crs)' to reproject geometries. "
+ else:
+ if not data.crs == crs:
+ raise ValueError(
+ "CRS mismatch between CRS of the passed geometries "
+ "and 'crs'. Use 'GeoSeries.set_crs(crs, "
+ "allow_override=True)' to overwrite CRS or "
+ "'GeoSeries.to_crs(crs)' to reproject geometries. "
)
+
if isinstance(data, SingleBlockManager):
if not isinstance(data.blocks[0].dtype, GeometryDtype):
raise TypeError(
- f"Non geometry data passed to GeoSeries constructor, received data of dtype '{data.blocks[0].dtype}'"
- )
+ "Non geometry data passed to GeoSeries constructor, "
+ f"received data of dtype '{data.blocks[0].dtype}'"
+ )
+
if isinstance(data, BaseGeometry):
+ # fix problem for scalar geometries passed, ensure the list of
+ # scalars is of correct length if index is specified
n = len(index) if index is not None else 1
data = [data] * n
- name = kwargs.pop('name', None)
+
+ name = kwargs.pop("name", None)
+
if not is_geometry_type(data):
- kwargs.pop('dtype', None)
+ # if data is None and dtype is specified (eg from empty overlay
+ # test), specifying dtype raises an error:
+ # https://github.com/pandas-dev/pandas/issues/26469
+ kwargs.pop("dtype", None)
+ # Use Series constructor to handle input data
with warnings.catch_warnings():
- empty_msg = 'The default dtype for empty Series'
- warnings.filterwarnings('ignore', empty_msg, DeprecationWarning
- )
- warnings.filterwarnings('ignore', empty_msg, FutureWarning)
+ # suppress additional warning from pandas for empty data
+ # (will always give object dtype instead of float dtype in the future,
+ # making the `if s.empty: s = s.astype(object)` below unnecessary)
+ empty_msg = "The default dtype for empty Series"
+ warnings.filterwarnings("ignore", empty_msg, DeprecationWarning)
+ warnings.filterwarnings("ignore", empty_msg, FutureWarning)
s = pd.Series(data, index=index, name=name, **kwargs)
+ # prevent trying to convert non-geometry objects
if s.dtype != object:
- if s.empty and s.dtype == 'float64' or data is None:
+ if (s.empty and s.dtype == "float64") or data is None:
+ # pd.Series with empty data gives float64 for older pandas versions
s = s.astype(object)
else:
raise TypeError(
- f"Non geometry data passed to GeoSeries constructor, received data of dtype '{s.dtype}'"
- )
+ "Non geometry data passed to GeoSeries constructor, "
+ f"received data of dtype '{s.dtype}'"
+ )
+ # extract object-dtype numpy array from pandas Series; with CoW this
+ # gives a read-only array, so we try to set the flag back to writeable
data = s.to_numpy()
try:
data.flags.writeable = True
except ValueError:
pass
+ # try to convert to GeometryArray
try:
data = from_shapely(data, crs)
except TypeError:
raise TypeError(
- f"Non geometry data passed to GeoSeries constructor, received data of dtype '{s.dtype}'"
- )
+ "Non geometry data passed to GeoSeries constructor, "
+ f"received data of dtype '{s.dtype}'"
+ )
index = s.index
name = s.name
+
super().__init__(data, index=index, name=name, **kwargs)
if not self.crs:
self.crs = crs
+ def append(self, *args, **kwargs) -> GeoSeries:
+ return self._wrapped_pandas_method("append", *args, **kwargs)
+
+ @GeoPandasBase.crs.setter
+ def crs(self, value):
+ if self.crs is not None:
+ warnings.warn(
+ "Overriding the CRS of a GeoSeries that already has CRS. "
+ "This unsafe behavior will be deprecated in future versions. "
+ "Use GeoSeries.set_crs method instead.",
+ stacklevel=2,
+ category=DeprecationWarning,
+ )
+ self.geometry.values.crs = value
+
+ @property
+ def geometry(self) -> GeoSeries:
+ return self
+
@property
- def x(self) ->Series:
+ def x(self) -> Series:
"""Return the x location of point geometries in a GeoSeries
Returns
@@ -193,10 +278,10 @@ class GeoSeries(GeoPandasBase, Series):
GeoSeries.z
"""
- pass
+ return _delegate_property("x", self)
@property
- def y(self) ->Series:
+ def y(self) -> Series:
"""Return the y location of point geometries in a GeoSeries
Returns
@@ -221,10 +306,10 @@ class GeoSeries(GeoPandasBase, Series):
GeoSeries.z
"""
- pass
+ return _delegate_property("y", self)
@property
- def z(self) ->Series:
+ def z(self) -> Series:
"""Return the z location of point geometries in a GeoSeries
Returns
@@ -249,11 +334,10 @@ class GeoSeries(GeoPandasBase, Series):
GeoSeries.y
"""
- pass
+ return _delegate_property("z", self)
@classmethod
- def from_file(cls, filename: (os.PathLike | typing.IO), **kwargs
- ) ->GeoSeries:
+ def from_file(cls, filename: os.PathLike | typing.IO, **kwargs) -> GeoSeries:
"""Alternate constructor to create a ``GeoSeries`` from a file.
Can load a ``GeoSeries`` from a file from any format recognized by
@@ -289,11 +373,16 @@ class GeoSeries(GeoPandasBase, Series):
--------
read_file : read file to GeoDataFrame
"""
- pass
+ from geopandas import GeoDataFrame
+
+ df = GeoDataFrame.from_file(filename, **kwargs)
+
+ return GeoSeries(df.geometry, crs=df.crs)
@classmethod
- def from_wkb(cls, data, index=None, crs: Optional[Any]=None, on_invalid
- ='raise', **kwargs) ->GeoSeries:
+ def from_wkb(
+ cls, data, index=None, crs: Optional[Any] = None, on_invalid="raise", **kwargs
+ ) -> GeoSeries:
"""
Alternate constructor to create a ``GeoSeries``
from a list or array of WKB objects
@@ -328,11 +417,14 @@ class GeoSeries(GeoPandasBase, Series):
GeoSeries.from_wkt
"""
- pass
+ return cls._from_wkb_or_wkt(
+ from_wkb, data, index=index, crs=crs, on_invalid=on_invalid, **kwargs
+ )
@classmethod
- def from_wkt(cls, data, index=None, crs: Optional[Any]=None, on_invalid
- ='raise', **kwargs) ->GeoSeries:
+ def from_wkt(
+ cls, data, index=None, crs: Optional[Any] = None, on_invalid="raise", **kwargs
+ ) -> GeoSeries:
"""
Alternate constructor to create a ``GeoSeries``
from a list or array of WKT objects
@@ -382,10 +474,12 @@ class GeoSeries(GeoPandasBase, Series):
2 POINT (3 3)
dtype: geometry
"""
- pass
+ return cls._from_wkb_or_wkt(
+ from_wkt, data, index=index, crs=crs, on_invalid=on_invalid, **kwargs
+ )
@classmethod
- def from_xy(cls, x, y, z=None, index=None, crs=None, **kwargs) ->GeoSeries:
+ def from_xy(cls, x, y, z=None, index=None, crs=None, **kwargs) -> GeoSeries:
"""
Alternate constructor to create a :class:`~geopandas.GeoSeries` of Point
geometries from lists or arrays of x, y(, z) coordinates
@@ -429,17 +523,41 @@ class GeoSeries(GeoPandasBase, Series):
2 POINT (-3 1.5)
dtype: geometry
"""
- pass
+ if index is None:
+ if (
+ isinstance(x, Series)
+ and isinstance(y, Series)
+ and x.index.equals(y.index)
+ and (z is None or (isinstance(z, Series) and x.index.equals(z.index)))
+ ): # check if we can reuse index
+ index = x.index
+ return cls(points_from_xy(x, y, z, crs=crs), index=index, crs=crs, **kwargs)
@classmethod
- def _from_wkb_or_wkt(cls, from_wkb_or_wkt_function: Callable, data,
- index=None, crs: Optional[Any]=None, on_invalid: str='raise', **kwargs
- ) ->GeoSeries:
+ def _from_wkb_or_wkt(
+ cls,
+ from_wkb_or_wkt_function: Callable,
+ data,
+ index=None,
+ crs: Optional[Any] = None,
+ on_invalid: str = "raise",
+ **kwargs,
+ ) -> GeoSeries:
"""Create a GeoSeries from either WKT or WKB values"""
- pass
+ if isinstance(data, Series):
+ if index is not None:
+ data = data.reindex(index)
+ else:
+ index = data.index
+ data = data.values
+ return cls(
+ from_wkb_or_wkt_function(data, crs=crs, on_invalid=on_invalid),
+ index=index,
+ **kwargs,
+ )
@classmethod
- def from_arrow(cls, arr, **kwargs) ->GeoSeries:
+ def from_arrow(cls, arr, **kwargs) -> GeoSeries:
"""
Construct a GeoSeries from a Arrow array object with a GeoArrow
extension type.
@@ -469,10 +587,12 @@ class GeoSeries(GeoPandasBase, Series):
GeoSeries
"""
- pass
+ from geopandas.io._geoarrow import arrow_to_geometry_array
+
+ return cls(arrow_to_geometry_array(arr), **kwargs)
@property
- def __geo_interface__(self) ->Dict:
+ def __geo_interface__(self) -> Dict:
"""Returns a ``GeoSeries`` as a python feature collection.
Implements the `geo_interface`. The returned python data structure
@@ -486,13 +606,25 @@ class GeoSeries(GeoPandasBase, Series):
>>> from shapely.geometry import Point
>>> s = geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)])
>>> s.__geo_interface__
- {'type': 'FeatureCollection', 'features': [{'id': '0', 'type': 'Feature', 'properties': {}, 'geometry': {'type': 'Point', 'coordinates': (1.0, 1.0)}, 'bbox': (1.0, 1.0, 1.0, 1.0)}, {'id': '1', 'type': 'Feature', 'properties': {}, 'geometry': {'type': 'Point', 'coordinates': (2.0, 2.0)}, 'bbox': (2.0, 2.0, 2.0, 2.0)}, {'id': '2', 'type': 'Feature', 'properties': {}, 'geometry': {'type': 'Point', 'coordinates': (3.0, 3.0)}, 'bbox': (3.0, 3.0, 3.0, 3.0)}], 'bbox': (1.0, 1.0, 3.0, 3.0)}
+ {'type': 'FeatureCollection', 'features': [{'id': '0', 'type': 'Feature', \
+'properties': {}, 'geometry': {'type': 'Point', 'coordinates': (1.0, 1.0)}, \
+'bbox': (1.0, 1.0, 1.0, 1.0)}, {'id': '1', 'type': 'Feature', \
+'properties': {}, 'geometry': {'type': 'Point', 'coordinates': (2.0, 2.0)}, \
+'bbox': (2.0, 2.0, 2.0, 2.0)}, {'id': '2', 'type': 'Feature', 'properties': \
+{}, 'geometry': {'type': 'Point', 'coordinates': (3.0, 3.0)}, 'bbox': (3.0, \
+3.0, 3.0, 3.0)}], 'bbox': (1.0, 1.0, 3.0, 3.0)}
"""
from geopandas import GeoDataFrame
- return GeoDataFrame({'geometry': self}).__geo_interface__
- def to_file(self, filename: (os.PathLike | typing.IO), driver: Optional
- [str]=None, index: Optional[bool]=None, **kwargs):
+ return GeoDataFrame({"geometry": self}).__geo_interface__
+
+ def to_file(
+ self,
+ filename: os.PathLike | typing.IO,
+ driver: Optional[str] = None,
+ index: Optional[bool] = None,
+ **kwargs,
+ ):
"""Write the ``GeoSeries`` to a file.
By default, an ESRI shapefile is written, but any OGR data source
@@ -554,16 +686,76 @@ class GeoSeries(GeoPandasBase, Series):
>>> s.to_file('series.geojson', driver='GeoJSON') # doctest: +SKIP
"""
- pass
+ from geopandas import GeoDataFrame
+
+ data = GeoDataFrame({"geometry": self}, index=self.index)
+ data.to_file(filename, driver, index=index, **kwargs)
+
+ #
+ # Implement pandas methods
+ #
+
+ @property
+ def _constructor(self):
+ return _geoseries_constructor_with_fallback
+
+ def _constructor_from_mgr(self, mgr, axes):
+ assert isinstance(mgr, SingleBlockManager)
+
+ if not isinstance(mgr.blocks[0].dtype, GeometryDtype):
+ return Series._from_mgr(mgr, axes)
+
+ return GeoSeries._from_mgr(mgr, axes)
+
+ @property
+ def _constructor_expanddim(self):
+ return _geoseries_expanddim
+
+ def _constructor_expanddim_from_mgr(self, mgr, axes):
+ df = pd.DataFrame._from_mgr(mgr, axes)
+ return _expanddim_logic(df)
def _wrapped_pandas_method(self, mtd, *args, **kwargs):
"""Wrap a generic pandas method to ensure it returns a GeoSeries"""
- pass
+ val = getattr(super(), mtd)(*args, **kwargs)
+ if type(val) == Series:
+ val.__class__ = GeoSeries
+ val.crs = self.crs
+ return val
def __getitem__(self, key):
- return self._wrapped_pandas_method('__getitem__', key)
+ return self._wrapped_pandas_method("__getitem__", key)
- def isna(self) ->Series:
+ @doc(pd.Series)
+ def sort_index(self, *args, **kwargs):
+ return self._wrapped_pandas_method("sort_index", *args, **kwargs)
+
+ @doc(pd.Series)
+ def take(self, *args, **kwargs):
+ return self._wrapped_pandas_method("take", *args, **kwargs)
+
+ @doc(pd.Series)
+ def select(self, *args, **kwargs):
+ return self._wrapped_pandas_method("select", *args, **kwargs)
+
+ @doc(pd.Series)
+ def apply(self, func, convert_dtype: Optional[bool] = None, args=(), **kwargs):
+ if convert_dtype is not None:
+ kwargs["convert_dtype"] = convert_dtype
+ else:
+ # for pandas >= 2.1, don't pass convert_dtype through and rely on the
+ # pandas default of True, to avoid internally triggering the pandas
+ # deprecation warning
+ if not compat.PANDAS_GE_21:
+ kwargs["convert_dtype"] = True
+
+ # to avoid warning
+ result = super().apply(func, args=args, **kwargs)
+ if isinstance(result, GeoSeries):
+ if self.crs is not None:
+ result.set_crs(self.crs, inplace=True)
+ return result
+
+ def isna(self) -> Series:
"""
Detect missing values.
@@ -602,13 +794,13 @@ class GeoSeries(GeoPandasBase, Series):
GeoSeries.notna : inverse of isna
GeoSeries.is_empty : detect empty geometries
"""
- pass
+ return super().isna()
- def isnull(self) ->Series:
+ def isnull(self) -> Series:
"""Alias for `isna` method. See `isna` for more detail."""
- pass
+ return self.isna()
- def notna(self) ->Series:
+ def notna(self) -> Series:
"""
Detect non-missing values.
@@ -647,13 +839,27 @@ class GeoSeries(GeoPandasBase, Series):
GeoSeries.isna : inverse of notna
GeoSeries.is_empty : detect empty geometries
"""
- pass
-
- def notnull(self) ->Series:
+ if self.is_empty.any():
+ warnings.warn(
+ "GeoSeries.notna() previously returned False for both missing (None) "
+ "and empty geometries. Now, it only returns False for missing values. "
+ "Since the calling GeoSeries contains empty geometries, the result "
+ "has changed compared to previous versions of GeoPandas.\n"
+ "Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get "
+ "back the old behaviour.\n\n"
+ "To further ignore this warning, you can do: \n"
+ "import warnings; warnings.filterwarnings('ignore', "
+ "'GeoSeries.notna', UserWarning)",
+ UserWarning,
+ stacklevel=2,
+ )
+ return super().notna()
+
+ def notnull(self) -> Series:
"""Alias for `notna` method. See `notna` for more detail."""
- pass
+ return self.notna()
- def fillna(self, value=None, inplace: bool=False, limit=None, **kwargs):
+ def fillna(self, value=None, inplace: bool = False, limit=None, **kwargs):
"""
Fill NA values with geometry (or geometries).
@@ -726,9 +932,11 @@ class GeoSeries(GeoPandasBase, Series):
--------
GeoSeries.isna : detect missing values
"""
- pass
+ if value is None:
+ value = GeometryCollection()
+ return super().fillna(value=value, limit=limit, inplace=inplace, **kwargs)
- def __contains__(self, other) ->bool:
+ def __contains__(self, other) -> bool:
"""Allow tests of the form "geom in s"
Tests whether a GeoSeries contains a geometry.
@@ -740,12 +948,16 @@ class GeoSeries(GeoPandasBase, Series):
else:
return False
+ @doc(plot_series)
+ def plot(self, *args, **kwargs):
+ return plot_series(self, *args, **kwargs)
+
@doc(_explore_geoseries)
def explore(self, *args, **kwargs):
"""Interactive map based on folium/leaflet.js"""
- pass
+ return _explore_geoseries(self, *args, **kwargs)
- def explode(self, ignore_index=False, index_parts=False) ->GeoSeries:
+ def explode(self, ignore_index=False, index_parts=False) -> GeoSeries:
"""
Explode multi-part geometries into multiple single geometries.
@@ -794,11 +1006,30 @@ class GeoSeries(GeoPandasBase, Series):
GeoDataFrame.explode
"""
- pass
+ from .base import _get_index_for_parts
+
+ geometries, outer_idx = shapely.get_parts(self.values._data, return_index=True)
+ index = _get_index_for_parts(
+ self.index,
+ outer_idx,
+ ignore_index=ignore_index,
+ index_parts=index_parts,
+ )
+
+ return GeoSeries(geometries, index=index, crs=self.crs).__finalize__(self)
+
+ #
+ # Additional methods
+ #
@compat.requires_pyproj
- def set_crs(self, crs: Optional[Any]=None, epsg: Optional[int]=None,
- inplace: bool=False, allow_override: bool=False):
+ def set_crs(
+ self,
+ crs: Optional[Any] = None,
+ epsg: Optional[int] = None,
+ inplace: bool = False,
+ allow_override: bool = False,
+ ):
"""
Set the Coordinate Reference System (CRS) of a ``GeoSeries``.
@@ -873,10 +1104,30 @@ class GeoSeries(GeoPandasBase, Series):
GeoSeries.to_crs : re-project to another CRS
"""
- pass
+ from pyproj import CRS
+
+ if crs is not None:
+ crs = CRS.from_user_input(crs)
+ elif epsg is not None:
+ crs = CRS.from_epsg(epsg)
+
+ if not allow_override and self.crs is not None and not self.crs == crs:
+ raise ValueError(
+ "The GeoSeries already has a CRS which is not equal to the passed "
+ "CRS. Specify 'allow_override=True' to allow replacing the existing "
+ "CRS without doing any transformation. If you actually want to "
+ "transform the geometries, use 'GeoSeries.to_crs' instead."
+ )
+ if not inplace:
+ result = self.copy()
+ else:
+ result = self
+ result.array.crs = crs
+ return result
- def to_crs(self, crs: Optional[Any]=None, epsg: Optional[int]=None
- ) ->GeoSeries:
+ def to_crs(
+ self, crs: Optional[Any] = None, epsg: Optional[int] = None
+ ) -> GeoSeries:
"""Returns a ``GeoSeries`` with all geometries transformed to a new
coordinate reference system.
@@ -952,9 +1203,11 @@ class GeoSeries(GeoPandasBase, Series):
GeoSeries.set_crs : assign CRS
"""
- pass
+ return GeoSeries(
+ self.values.to_crs(crs=crs, epsg=epsg), index=self.index, name=self.name
+ )
- def estimate_utm_crs(self, datum_name: str='WGS 84'):
+ def estimate_utm_crs(self, datum_name: str = "WGS 84"):
"""Returns the estimated UTM CRS based on the bounds of the dataset.
.. versionadded:: 0.9
@@ -990,10 +1243,15 @@ class GeoSeries(GeoPandasBase, Series):
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
"""
- pass
-
- def to_json(self, show_bbox: bool=True, drop_id: bool=False, to_wgs84:
- bool=False, **kwargs) ->str:
+ return self.values.estimate_utm_crs(datum_name)
+
+ def to_json(
+ self,
+ show_bbox: bool = True,
+ drop_id: bool = False,
+ to_wgs84: bool = False,
+ **kwargs,
+ ) -> str:
"""
Returns a GeoJSON string representation of the GeoSeries.
@@ -1029,15 +1287,22 @@ class GeoSeries(GeoPandasBase, Series):
dtype: geometry
>>> s.to_json()
- '{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [1.0, 1.0]}, "bbox": [1.0, 1.0, 1.0, 1.0]}, {"id": "1", "type": "Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [2.0, 2.0]}, "bbox": [2.0, 2.0, 2.0, 2.0]}, {"id": "2", "type": "Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [3.0, 3.0]}, "bbox": [3.0, 3.0, 3.0, 3.0]}], "bbox": [1.0, 1.0, 3.0, 3.0]}'
+ '{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "pr\
+operties": {}, "geometry": {"type": "Point", "coordinates": [1.0, 1.0]}, "bbox": [1.0,\
+ 1.0, 1.0, 1.0]}, {"id": "1", "type": "Feature", "properties": {}, "geometry": {"type"\
+: "Point", "coordinates": [2.0, 2.0]}, "bbox": [2.0, 2.0, 2.0, 2.0]}, {"id": "2", "typ\
+e": "Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [3.0, 3.\
+0]}, "bbox": [3.0, 3.0, 3.0, 3.0]}], "bbox": [1.0, 1.0, 3.0, 3.0]}'
See Also
--------
GeoSeries.to_file : write GeoSeries to file
"""
- pass
+ return self.to_frame("geometry").to_json(
+ na="null", show_bbox=show_bbox, drop_id=drop_id, to_wgs84=to_wgs84, **kwargs
+ )
- def to_wkb(self, hex: bool=False, **kwargs) ->Series:
+ def to_wkb(self, hex: bool = False, **kwargs) -> Series:
"""
Convert GeoSeries geometries to WKB
@@ -1059,9 +1324,9 @@ class GeoSeries(GeoPandasBase, Series):
--------
GeoSeries.to_wkt
"""
- pass
+ return Series(to_wkb(self.array, hex=hex, **kwargs), index=self.index)
- def to_wkt(self, **kwargs) ->Series:
+ def to_wkt(self, **kwargs) -> Series:
"""
Convert GeoSeries geometries to WKT
@@ -1095,10 +1360,9 @@ class GeoSeries(GeoPandasBase, Series):
--------
GeoSeries.to_wkb
"""
- pass
+ return Series(to_wkt(self.array, **kwargs), index=self.index)
- def to_arrow(self, geometry_encoding='WKB', interleaved=True, include_z
- =None):
+ def to_arrow(self, geometry_encoding="WKB", interleaved=True, include_z=None):
"""Encode a GeoSeries to GeoArrow format.
See https://geoarrow.org/ for details on the GeoArrow specification.
@@ -1163,9 +1427,40 @@ class GeoSeries(GeoPandasBase, Series):
]
"""
- pass
+ import pyarrow as pa
+
+ from geopandas.io._geoarrow import (
+ GeoArrowArray,
+ construct_geometry_array,
+ construct_wkb_array,
+ )
+
+ field_name = self.name if self.name is not None else ""
+
+ if geometry_encoding.lower() == "geoarrow":
+ if Version(pa.__version__) < Version("10.0.0"):
+ raise ValueError("Converting to 'geoarrow' requires pyarrow >= 10.0.")
+
+ field, geom_arr = construct_geometry_array(
+ np.array(self.array),
+ include_z=include_z,
+ field_name=field_name,
+ crs=self.crs,
+ interleaved=interleaved,
+ )
+ elif geometry_encoding.lower() == "wkb":
+ field, geom_arr = construct_wkb_array(
+ np.asarray(self.array), field_name=field_name, crs=self.crs
+ )
+ else:
+ raise ValueError(
+ "Expected geometry encoding 'WKB' or 'geoarrow' "
+ f"got {geometry_encoding}"
+ )
+
+ return GeoArrowArray(field, geom_arr)
- def clip(self, mask, keep_geom_type: bool=False, sort=False) ->GeoSeries:
+ def clip(self, mask, keep_geom_type: bool = False, sort=False) -> GeoSeries:
"""Clip points, lines, or polygon geometries to the mask extent.
Both layers must be in the same Coordinate Reference System (CRS).
@@ -1222,4 +1517,4 @@ class GeoSeries(GeoPandasBase, Series):
>>> nws_groceries.shape
(7,)
"""
- pass
+ return geopandas.clip(self, mask=mask, keep_geom_type=keep_geom_type, sort=sort)
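
Similarly for `GeoSeries`, a small sketch of the constructor fallback and the CRS handling implemented above. It is not part of the patch; the data is made up, and the `set_crs`/`to_crs` calls additionally assume pyproj is installed:

```python
import geopandas
from shapely.geometry import Point

# hypothetical points, only for illustration
s = geopandas.GeoSeries.from_xy([0, 1, 2], [0, 1, 2], crs="EPSG:4326")

# geometry-producing operations stay GeoSeries ...
assert isinstance(s.centroid, geopandas.GeoSeries)
# ... while plain-valued results fall back to a pandas Series
assert not isinstance(s.x, geopandas.GeoSeries)

# set_crs refuses to silently override an existing, different CRS
try:
    s.set_crs("EPSG:3857")
except ValueError:
    pass  # expected: use allow_override=True or to_crs() instead

# to_crs reprojects and returns a new GeoSeries
assert s.to_crs(epsg=3857).crs.to_epsg() == 3857
```
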
diff --git a/geopandas/io/_geoarrow.py b/geopandas/io/_geoarrow.py
index cb4401fd..32ccf519 100644
--- a/geopandas/io/_geoarrow.py
+++ b/geopandas/io/_geoarrow.py
@@ -1,17 +1,30 @@
import json
from packaging.version import Version
from typing import Dict, Optional, Tuple
+
import numpy as np
import pandas as pd
import pyarrow as pa
from numpy.typing import NDArray
+
import shapely
from shapely import GeometryType
+
from geopandas import GeoDataFrame
from geopandas._compat import SHAPELY_GE_204
from geopandas.array import from_shapely, from_wkb
-GEOARROW_ENCODINGS = ['point', 'linestring', 'polygon', 'multipoint',
- 'multilinestring', 'multipolygon']
+
+GEOARROW_ENCODINGS = [
+ "point",
+ "linestring",
+ "polygon",
+ "multipoint",
+ "multilinestring",
+ "multipolygon",
+]
+
+
+## GeoPandas -> GeoArrow
class ArrowTable:
@@ -35,8 +48,7 @@ class ArrowTable:
self._pa_table = pa_table
def __arrow_c_stream__(self, requested_schema=None):
- return self._pa_table.__arrow_c_stream__(requested_schema=
- requested_schema)
+ return self._pa_table.__arrow_c_stream__(requested_schema=requested_schema)
class GeoArrowArray:
@@ -63,13 +75,21 @@ class GeoArrowArray:
def __arrow_c_array__(self, requested_schema=None):
if requested_schema is not None:
raise NotImplementedError(
- 'Requested schema is not supported for geometry arrays')
- return self._pa_field.__arrow_c_schema__(
- ), self._pa_array.__arrow_c_array__()[1]
+ "Requested schema is not supported for geometry arrays"
+ )
+ return (
+ self._pa_field.__arrow_c_schema__(),
+ self._pa_array.__arrow_c_array__()[1],
+ )
-def geopandas_to_arrow(df, index=None, geometry_encoding='WKB', interleaved
- =True, include_z=None):
+def geopandas_to_arrow(
+ df,
+ index=None,
+ geometry_encoding="WKB",
+ interleaved=True,
+ include_z=None,
+):
"""
Convert GeoDataFrame to a pyarrow.Table.
@@ -101,7 +121,330 @@ def geopandas_to_arrow(df, index=None, geometry_encoding='WKB', interleaved
specify the keyword).
"""
- pass
+ mask = df.dtypes == "geometry"
+ geometry_columns = df.columns[mask]
+ geometry_indices = np.asarray(mask).nonzero()[0]
+
+ df_attr = pd.DataFrame(df.copy(deep=False))
+
+ # replace geometry columns with dummy values -> will get converted to
+ # Arrow null column (not holding any memory), so we can afterwards
+ # fill the resulting table with the correct geometry fields
+ for col in geometry_columns:
+ df_attr[col] = None
+
+ table = pa.Table.from_pandas(df_attr, preserve_index=index)
+
+ geometry_encoding_dict = {}
+
+ if geometry_encoding.lower() == "geoarrow":
+ if Version(pa.__version__) < Version("10.0.0"):
+ raise ValueError("Converting to 'geoarrow' requires pyarrow >= 10.0.")
+
+ # Encode all geometry columns to GeoArrow
+ for i, col in zip(geometry_indices, geometry_columns):
+ field, geom_arr = construct_geometry_array(
+ np.array(df[col].array),
+ include_z=include_z,
+ field_name=col,
+ crs=df[col].crs,
+ interleaved=interleaved,
+ )
+ table = table.set_column(i, field, geom_arr)
+ geometry_encoding_dict[col] = (
+ field.metadata[b"ARROW:extension:name"]
+ .decode()
+ .removeprefix("geoarrow.")
+ )
+
+ elif geometry_encoding.lower() == "wkb":
+ # Encode all geometry columns to WKB
+ for i, col in zip(geometry_indices, geometry_columns):
+ field, wkb_arr = construct_wkb_array(
+ np.asarray(df[col].array), field_name=col, crs=df[col].crs
+ )
+ table = table.set_column(i, field, wkb_arr)
+ geometry_encoding_dict[col] = "WKB"
+
+ else:
+ raise ValueError(
+ f"Expected geometry encoding 'WKB' or 'geoarrow' got {geometry_encoding}"
+ )
+ return table, geometry_encoding_dict
+
+
+def construct_wkb_array(
+ shapely_arr: NDArray[np.object_],
+ *,
+ field_name: str = "geometry",
+ crs: Optional[str] = None,
+) -> Tuple[pa.Field, pa.Array]:
+
+ if shapely.geos_version > (3, 10, 0):
+ kwargs = {"flavor": "iso"}
+ else:
+ if shapely.has_z(shapely_arr).any():
+ raise ValueError("Cannot write 3D geometries with GEOS<3.10")
+ kwargs = {}
+
+ wkb_arr = shapely.to_wkb(shapely_arr, **kwargs)
+ extension_metadata = {"ARROW:extension:name": "geoarrow.wkb"}
+ if crs is not None:
+ extension_metadata["ARROW:extension:metadata"] = json.dumps(
+ {"crs": crs.to_json()}
+ )
+ else:
+ # In theory this should not be needed, but otherwise pyarrow < 17
+ # crashes on receiving such data through C Data Interface
+ # https://github.com/apache/arrow/issues/41741
+ extension_metadata["ARROW:extension:metadata"] = "{}"
+
+ field = pa.field(
+ field_name, type=pa.binary(), nullable=True, metadata=extension_metadata
+ )
+ parr = pa.array(np.asarray(wkb_arr), pa.binary())
+ return field, parr
+
+
+def _convert_inner_coords(coords, interleaved, dims, mask=None):
+ if interleaved:
+ coords_field = pa.field(dims, pa.float64(), nullable=False)
+ typ = pa.list_(coords_field, len(dims))
+ if mask is None:
+ # mask keyword only added in pyarrow 15.0.0
+ parr = pa.FixedSizeListArray.from_arrays(coords.ravel(), type=typ)
+ else:
+ parr = pa.FixedSizeListArray.from_arrays(
+ coords.ravel(), type=typ, mask=mask
+ )
+ else:
+ if dims == "xy":
+ fields = [
+ pa.field("x", pa.float64(), nullable=False),
+ pa.field("y", pa.float64(), nullable=False),
+ ]
+ parr = pa.StructArray.from_arrays(
+ [coords[:, 0].copy(), coords[:, 1].copy()], fields=fields, mask=mask
+ )
+ else:
+ fields = [
+ pa.field("x", pa.float64(), nullable=False),
+ pa.field("y", pa.float64(), nullable=False),
+ pa.field("z", pa.float64(), nullable=False),
+ ]
+ parr = pa.StructArray.from_arrays(
+ [coords[:, 0].copy(), coords[:, 1].copy(), coords[:, 2].copy()],
+ fields=fields,
+ mask=mask,
+ )
+ return parr
+
+
+def _linestring_type(point_type):
+ return pa.list_(pa.field("vertices", point_type, nullable=False))
+
+
+def _polygon_type(point_type):
+ return pa.list_(
+ pa.field(
+ "rings",
+ pa.list_(pa.field("vertices", point_type, nullable=False)),
+ nullable=False,
+ )
+ )
+
+
+def _multipoint_type(point_type):
+ return pa.list_(pa.field("points", point_type, nullable=False))
+
+
+def _multilinestring_type(point_type):
+ return pa.list_(
+ pa.field("linestrings", _linestring_type(point_type), nullable=False)
+ )
+
+
+def _multipolygon_type(point_type):
+ return pa.list_(pa.field("polygons", _polygon_type(point_type), nullable=False))
+
+
+def construct_geometry_array(
+ shapely_arr: NDArray[np.object_],
+ include_z: Optional[bool] = None,
+ *,
+ field_name: str = "geometry",
+ crs: Optional[str] = None,
+ interleaved: bool = True,
+) -> Tuple[pa.Field, pa.Array]:
+ # NOTE: this implementation returns a (field, array) pair so that it can set the
+ # extension metadata on the field without instantiating extension types into the
+ # global pyarrow registry
+ geom_type, coords, offsets = shapely.to_ragged_array(
+ shapely_arr, include_z=include_z
+ )
+
+ mask = shapely.is_missing(shapely_arr)
+ if mask.any():
+ if (
+ geom_type == GeometryType.POINT
+ and interleaved
+ and Version(pa.__version__) < Version("15.0.0")
+ ):
+ raise ValueError(
+ "Converting point geometries with missing values is not supported "
+ "for interleaved coordinates with pyarrow < 15.0.0. Please "
+ "upgrade to a newer version of pyarrow."
+ )
+ mask = pa.array(mask, type=pa.bool_())
+
+ if geom_type == GeometryType.POINT and not SHAPELY_GE_204:
+ # bug in shapely < 2.0.4, see https://github.com/shapely/shapely/pull/2034
+ # this workaround only works if there are no empty points
+ indices = np.nonzero(mask)[0]
+ indices = indices - np.arange(len(indices))
+ coords = np.insert(coords, indices, np.nan, axis=0)
+
+ else:
+ mask = None
+
+ if coords.shape[-1] == 2:
+ dims = "xy"
+ elif coords.shape[-1] == 3:
+ dims = "xyz"
+ else:
+ raise ValueError(f"Unexpected coords dimensions: {coords.shape}")
+
+ extension_metadata: Dict[str, str] = {}
+ if crs is not None:
+ extension_metadata["ARROW:extension:metadata"] = json.dumps(
+ {"crs": crs.to_json()}
+ )
+ else:
+ # In theory this should not be needed, but otherwise pyarrow < 17
+ # crashes on receiving such data through C Data Interface
+ # https://github.com/apache/arrow/issues/41741
+ extension_metadata["ARROW:extension:metadata"] = "{}"
+
+ if geom_type == GeometryType.POINT:
+ parr = _convert_inner_coords(coords, interleaved, dims, mask=mask)
+ extension_metadata["ARROW:extension:name"] = "geoarrow.point"
+ field = pa.field(
+ field_name,
+ parr.type,
+ nullable=True,
+ metadata=extension_metadata,
+ )
+ return field, parr
+
+ elif geom_type == GeometryType.LINESTRING:
+ assert len(offsets) == 1, "Expected one offsets array"
+ (geom_offsets,) = offsets
+ _parr = _convert_inner_coords(coords, interleaved, dims)
+ parr = pa.ListArray.from_arrays(
+ pa.array(geom_offsets), _parr, _linestring_type(_parr.type), mask=mask
+ )
+ extension_metadata["ARROW:extension:name"] = "geoarrow.linestring"
+ field = pa.field(
+ field_name,
+ parr.type,
+ nullable=True,
+ metadata=extension_metadata,
+ )
+ return field, parr
+
+ elif geom_type == GeometryType.POLYGON:
+ assert len(offsets) == 2, "Expected two offsets arrays"
+ ring_offsets, geom_offsets = offsets
+ _parr = _convert_inner_coords(coords, interleaved, dims)
+ _parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
+ parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
+ parr = parr.cast(_polygon_type(_parr.type))
+ extension_metadata["ARROW:extension:name"] = "geoarrow.polygon"
+ field = pa.field(
+ field_name,
+ parr.type,
+ nullable=True,
+ metadata=extension_metadata,
+ )
+ return field, parr
+
+ elif geom_type == GeometryType.MULTIPOINT:
+ assert len(offsets) == 1, "Expected one offsets array"
+ (geom_offsets,) = offsets
+ _parr = _convert_inner_coords(coords, interleaved, dims)
+ parr = pa.ListArray.from_arrays(
+ pa.array(geom_offsets), _parr, type=_multipoint_type(_parr.type), mask=mask
+ )
+ extension_metadata["ARROW:extension:name"] = "geoarrow.multipoint"
+ field = pa.field(
+ field_name,
+ parr.type,
+ nullable=True,
+ metadata=extension_metadata,
+ )
+ return field, parr
+
+ elif geom_type == GeometryType.MULTILINESTRING:
+ assert len(offsets) == 2, "Expected two offsets arrays"
+ ring_offsets, geom_offsets = offsets
+ _parr = _convert_inner_coords(coords, interleaved, dims)
+ _parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
+ parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
+ parr = parr.cast(_multilinestring_type(_parr.type))
+ extension_metadata["ARROW:extension:name"] = "geoarrow.multilinestring"
+ field = pa.field(
+ field_name,
+ parr.type,
+ nullable=True,
+ metadata=extension_metadata,
+ )
+ return field, parr
+
+ elif geom_type == GeometryType.MULTIPOLYGON:
+ assert len(offsets) == 3, "Expected three offsets arrays"
+ ring_offsets, polygon_offsets, geom_offsets = offsets
+ _parr = _convert_inner_coords(coords, interleaved, dims)
+ _parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
+ _parr2 = pa.ListArray.from_arrays(pa.array(polygon_offsets), _parr1)
+ parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr2, mask=mask)
+ parr = parr.cast(_multipolygon_type(_parr.type))
+ extension_metadata["ARROW:extension:name"] = "geoarrow.multipolygon"
+ field = pa.field(
+ field_name,
+ parr.type,
+ nullable=True,
+ metadata=extension_metadata,
+ )
+ return field, parr
+
+ else:
+ raise ValueError(f"Unsupported type for geoarrow: {geom_type}")
+
+
+## GeoArrow -> GeoPandas
+
+
+def _get_arrow_geometry_field(field):
+ if (meta := field.metadata) is not None:
+ if (ext_name := meta.get(b"ARROW:extension:name", None)) is not None:
+ if ext_name.startswith(b"geoarrow."):
+ if (
+ ext_meta := meta.get(b"ARROW:extension:metadata", None)
+ ) is not None:
+ ext_meta = json.loads(ext_meta.decode())
+ return ext_name.decode(), ext_meta
+
+ if isinstance(field.type, pa.ExtensionType):
+ ext_name = field.type.extension_name
+ if ext_name.startswith("geoarrow."):
+ ext_meta_ser = field.type.__arrow_ext_serialize__()
+ if ext_meta_ser:
+ ext_meta = json.loads(ext_meta_ser.decode())
+ else:
+ ext_meta = None
+ return ext_name, ext_meta
+
+ return None
def arrow_to_geopandas(table, geometry=None):
@@ -121,7 +464,40 @@ def arrow_to_geopandas(table, geometry=None):
GeoDataFrame
"""
- pass
+ if not isinstance(table, pa.Table):
+ table = pa.table(table)
+
+ geom_fields = []
+
+ for i, field in enumerate(table.schema):
+ geom = _get_arrow_geometry_field(field)
+ if geom is not None:
+ geom_fields.append((i, field.name, *geom))
+
+ if len(geom_fields) == 0:
+ raise ValueError("No geometry column found in the Arrow table.")
+
+ table_attr = table.drop([f[1] for f in geom_fields])
+ df = table_attr.to_pandas()
+
+ for i, col, ext_name, ext_meta in geom_fields:
+ crs = None
+ if ext_meta is not None and "crs" in ext_meta:
+ crs = ext_meta["crs"]
+
+ if ext_name == "geoarrow.wkb":
+ geom_arr = from_wkb(np.array(table[col]), crs=crs)
+ elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
+
+ geom_arr = from_shapely(
+ construct_shapely_array(table[col].combine_chunks(), ext_name), crs=crs
+ )
+ else:
+ raise TypeError(f"Unknown GeoArrow extension type: {ext_name}")
+
+ df.insert(i, col, geom_arr)
+
+ return GeoDataFrame(df, geometry=geometry or geom_fields[0][1])
def arrow_to_geometry_array(arr):
@@ -131,7 +507,51 @@ def arrow_to_geometry_array(arr):
Specifically for GeoSeries.from_arrow.
"""
- pass
+ if Version(pa.__version__) < Version("14.0.0"):
+ raise ValueError("Importing from Arrow requires pyarrow >= 14.0.")
+
+ schema_capsule, array_capsule = arr.__arrow_c_array__()
+ field = pa.Field._import_from_c_capsule(schema_capsule)
+ pa_arr = pa.Array._import_from_c_capsule(field.__arrow_c_schema__(), array_capsule)
+
+ geom_info = _get_arrow_geometry_field(field)
+ if geom_info is None:
+ raise ValueError("No GeoArrow geometry field found.")
+ ext_name, ext_meta = geom_info
+
+ crs = None
+ if ext_meta is not None and "crs" in ext_meta:
+ crs = ext_meta["crs"]
+
+ if ext_name == "geoarrow.wkb":
+ geom_arr = from_wkb(np.array(pa_arr), crs=crs)
+ elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
+
+ geom_arr = from_shapely(construct_shapely_array(pa_arr, ext_name), crs=crs)
+ else:
+ raise ValueError(f"Unknown GeoArrow extension type: {ext_name}")
+
+ return geom_arr
+
+
+def _get_inner_coords(arr):
+ if pa.types.is_struct(arr.type):
+ if arr.type.num_fields == 2:
+ coords = np.column_stack(
+ [np.asarray(arr.field("x")), np.asarray(arr.field("y"))]
+ )
+ else:
+ coords = np.column_stack(
+ [
+ np.asarray(arr.field("x")),
+ np.asarray(arr.field("y")),
+ np.asarray(arr.field("z")),
+ ]
+ )
+ return coords
+ else:
+ # fixed size list
+ return np.asarray(arr.values).reshape(len(arr), -1)
def construct_shapely_array(arr: pa.Array, extension_name: str):
@@ -140,4 +560,55 @@ def construct_shapely_array(arr: pa.Array, extension_name: str):
with GeoArrow extension type.
"""
- pass
+ if isinstance(arr, pa.ExtensionArray):
+ arr = arr.storage
+
+ if extension_name == "geoarrow.point":
+ coords = _get_inner_coords(arr)
+ result = shapely.from_ragged_array(GeometryType.POINT, coords, None)
+
+ elif extension_name == "geoarrow.linestring":
+ coords = _get_inner_coords(arr.values)
+ offsets1 = np.asarray(arr.offsets)
+ offsets = (offsets1,)
+ result = shapely.from_ragged_array(GeometryType.LINESTRING, coords, offsets)
+
+ elif extension_name == "geoarrow.polygon":
+ coords = _get_inner_coords(arr.values.values)
+ offsets2 = np.asarray(arr.offsets)
+ offsets1 = np.asarray(arr.values.offsets)
+ offsets = (offsets1, offsets2)
+ result = shapely.from_ragged_array(GeometryType.POLYGON, coords, offsets)
+
+ elif extension_name == "geoarrow.multipoint":
+ coords = _get_inner_coords(arr.values)
+ offsets1 = np.asarray(arr.offsets)
+ offsets = (offsets1,)
+ result = shapely.from_ragged_array(GeometryType.MULTIPOINT, coords, offsets)
+
+ elif extension_name == "geoarrow.multilinestring":
+ coords = _get_inner_coords(arr.values.values)
+ offsets2 = np.asarray(arr.offsets)
+ offsets1 = np.asarray(arr.values.offsets)
+ offsets = (offsets1, offsets2)
+ result = shapely.from_ragged_array(
+ GeometryType.MULTILINESTRING, coords, offsets
+ )
+
+ elif extension_name == "geoarrow.multipolygon":
+ coords = _get_inner_coords(arr.values.values.values)
+ offsets3 = np.asarray(arr.offsets)
+ offsets2 = np.asarray(arr.values.offsets)
+ offsets1 = np.asarray(arr.values.values.offsets)
+ offsets = (offsets1, offsets2, offsets3)
+ result = shapely.from_ragged_array(GeometryType.MULTIPOLYGON, coords, offsets)
+
+ else:
+ raise ValueError(extension_name)
+
+ # apply validity mask
+ if arr.null_count:
+ mask = np.asarray(arr.is_null())
+ result = np.where(mask, None, result)
+
+ return result
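
To show the GeoArrow helpers above end to end, a minimal round-trip sketch via `GeoSeries.to_arrow` and `GeoSeries.from_arrow`. It is not part of the patch; the data is made up, and pyarrow >= 14 is assumed for the import path:

```python
import geopandas
from shapely.geometry import Point

# hypothetical data, only for illustration
s = geopandas.GeoSeries([Point(0, 0), Point(1, 1)], crs="EPSG:4326")

# encode to GeoArrow: WKB-encoded geometries plus Arrow extension metadata
arrow_arr = s.to_arrow(geometry_encoding="WKB")

# decode back; the CRS travels in the "ARROW:extension:metadata" field
roundtripped = geopandas.GeoSeries.from_arrow(arrow_arr)
assert roundtripped.crs == s.crs
assert (roundtripped == s).all()
```
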
diff --git a/geopandas/io/_pyarrow_hotfix.py b/geopandas/io/_pyarrow_hotfix.py
index f8586c5c..731db2d6 100644
--- a/geopandas/io/_pyarrow_hotfix.py
+++ b/geopandas/io/_pyarrow_hotfix.py
@@ -1,10 +1,12 @@
from packaging.version import Version
+
import pyarrow
-_ERROR_MSG = """Disallowed deserialization of 'arrow.py_extension_type':
+
+_ERROR_MSG = """\
+Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
-pickle disassembly:
-{pickle_disassembly}
+pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
@@ -19,4 +21,52 @@ derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
See https://arrow.apache.org/docs/dev/python/extending_types.html#defining-extension-types-user-defined-types
for more details.
"""
+
+
+def patch_pyarrow():
+ # starting from pyarrow 14.0.1, it has its own mechanism
+ if Version(pyarrow.__version__) >= Version("14.0.1"):
+ return
+
+ # if the user has pyarrow_hotfix (https://github.com/pitrou/pyarrow-hotfix)
+ # installed, use this instead (which also ensures it works if they had
+ # called `pyarrow_hotfix.uninstall()`)
+ try:
+ import pyarrow_hotfix # noqa: F401
+ except ImportError:
+ pass
+ else:
+ return
+
+ # if the hotfix is already installed and enabled
+ if getattr(pyarrow, "_hotfix_installed", False):
+ return
+
+ class ForbiddenExtensionType(pyarrow.ExtensionType):
+ def __arrow_ext_serialize__(self):
+ return b""
+
+ @classmethod
+ def __arrow_ext_deserialize__(cls, storage_type, serialized):
+ import io
+ import pickletools
+
+ out = io.StringIO()
+ pickletools.dis(serialized, out)
+ raise RuntimeError(
+ _ERROR_MSG.format(
+ storage_type=storage_type,
+ serialized=serialized,
+ pickle_disassembly=out.getvalue(),
+ )
+ )
+
+ pyarrow.unregister_extension_type("arrow.py_extension_type")
+ pyarrow.register_extension_type(
+ ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
+ )
+
+ pyarrow._hotfix_installed = True
+
+
patch_pyarrow()
diff --git a/geopandas/io/arrow.py b/geopandas/io/arrow.py
index defcba99..53cf77ed 100644
--- a/geopandas/io/arrow.py
+++ b/geopandas/io/arrow.py
@@ -1,19 +1,65 @@
import json
import warnings
from packaging.version import Version
+
import numpy as np
from pandas import DataFrame, Series
+
import shapely
+
import geopandas
from geopandas import GeoDataFrame
from geopandas._compat import import_optional_dependency
from geopandas.array import from_shapely, from_wkb
+
from .file import _expand_user
-METADATA_VERSION = '1.0.0'
-SUPPORTED_VERSIONS = ['0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', '1.1.0']
-GEOARROW_ENCODINGS = ['point', 'linestring', 'polygon', 'multipoint',
- 'multilinestring', 'multipolygon']
-SUPPORTED_ENCODINGS = ['WKB'] + GEOARROW_ENCODINGS
+
+METADATA_VERSION = "1.0.0"
+SUPPORTED_VERSIONS = ["0.1.0", "0.4.0", "1.0.0-beta.1", "1.0.0", "1.1.0"]
+GEOARROW_ENCODINGS = [
+ "point",
+ "linestring",
+ "polygon",
+ "multipoint",
+ "multilinestring",
+ "multipolygon",
+]
+SUPPORTED_ENCODINGS = ["WKB"] + GEOARROW_ENCODINGS
+
+# reference: https://github.com/opengeospatial/geoparquet
+
+# Metadata structure:
+# {
+# "geo": {
+# "columns": {
+# "<name>": {
+# "encoding": "WKB"
+# "geometry_types": <list of str: REQUIRED>
+# "crs": "<PROJJSON or None: OPTIONAL>",
+# "orientation": "<'counterclockwise' or None: OPTIONAL>"
+# "edges": "planar"
+# "bbox": <list of [xmin, ymin, xmax, ymax]: OPTIONAL>
+# "epoch": <float: OPTIONAL>
+# }
+# },
+# "primary_column": "<str: REQUIRED>",
+# "version": "<METADATA_VERSION>",
+#
+# # Additional GeoPandas specific metadata (not in metadata spec)
+# "creator": {
+# "library": "geopandas",
+# "version": "<geopandas.__version__>"
+# }
+# }
+# }
+
+
+def _is_fsspec_url(url):
+ return (
+ isinstance(url, str)
+ and "://" in url
+ and not url.startswith(("http://", "https://"))
+ )
def _remove_id_from_member_of_ensembles(json_dict):
@@ -26,24 +72,48 @@ def _remove_id_from_member_of_ensembles(json_dict):
Mimicking the patch to GDAL from https://github.com/OSGeo/gdal/pull/5872
"""
- pass
-
-
-_geometry_type_names = ['Point', 'LineString', 'LineString', 'Polygon',
- 'MultiPoint', 'MultiLineString', 'MultiPolygon', 'GeometryCollection']
-_geometry_type_names += [(geom_type + ' Z') for geom_type in
- _geometry_type_names]
+ for key, value in json_dict.items():
+ if isinstance(value, dict):
+ _remove_id_from_member_of_ensembles(value)
+ elif key == "members" and isinstance(value, list):
+ for member in value:
+ member.pop("id", None)
+
+
+# type ids 0 to 7
+_geometry_type_names = [
+ "Point",
+ "LineString",
+ "LineString",
+ "Polygon",
+ "MultiPoint",
+ "MultiLineString",
+ "MultiPolygon",
+ "GeometryCollection",
+]
+_geometry_type_names += [geom_type + " Z" for geom_type in _geometry_type_names]
def _get_geometry_types(series):
"""
Get unique geometry types from a GeoSeries.
"""
- pass
+ arr_geometry_types = shapely.get_type_id(series.array._data)
+ # ensure to include "... Z" for 3D geometries
+ has_z = shapely.has_z(series.array._data)
+ arr_geometry_types[has_z] += 8
+
+ geometry_types = Series(arr_geometry_types).unique().tolist()
+ # drop missing values (shapely.get_type_id returns -1 for those)
+ if -1 in geometry_types:
+ geometry_types.remove(-1)
+ return sorted([_geometry_type_names[idx] for idx in geometry_types])
-def _create_metadata(df, schema_version=None, geometry_encoding=None,
- write_covering_bbox=False):
+
+def _create_metadata(
+ df, schema_version=None, geometry_encoding=None, write_covering_bbox=False
+):
"""Create and encode geo metadata dict.
Parameters
@@ -61,7 +131,67 @@ def _create_metadata(df, schema_version=None, geometry_encoding=None,
-------
dict
"""
- pass
+ if schema_version is None:
+ if geometry_encoding and any(
+ encoding != "WKB" for encoding in geometry_encoding.values()
+ ):
+ schema_version = "1.1.0"
+ else:
+ schema_version = METADATA_VERSION
+
+ if schema_version not in SUPPORTED_VERSIONS:
+ raise ValueError(
+ f"schema_version must be one of: {', '.join(SUPPORTED_VERSIONS)}"
+ )
+
+ # Construct metadata for each geometry
+ column_metadata = {}
+ for col in df.columns[df.dtypes == "geometry"]:
+ series = df[col]
+
+ geometry_types = _get_geometry_types(series)
+ if schema_version[0] == "0":
+ geometry_types_name = "geometry_type"
+ if len(geometry_types) == 1:
+ geometry_types = geometry_types[0]
+ else:
+ geometry_types_name = "geometry_types"
+
+ crs = None
+ if series.crs:
+ if schema_version == "0.1.0":
+ crs = series.crs.to_wkt()
+ else: # version >= 0.4.0
+ crs = series.crs.to_json_dict()
+ _remove_id_from_member_of_ensembles(crs)
+
+ column_metadata[col] = {
+ "encoding": geometry_encoding[col],
+ "crs": crs,
+ geometry_types_name: geometry_types,
+ }
+
+ bbox = series.total_bounds.tolist()
+ if np.isfinite(bbox).all():
+ # don't add bbox with NaNs for empty / all-NA geometry column
+ column_metadata[col]["bbox"] = bbox
+
+ if write_covering_bbox:
+ column_metadata[col]["covering"] = {
+ "bbox": {
+ "xmin": ["bbox", "xmin"],
+ "ymin": ["bbox", "ymin"],
+ "xmax": ["bbox", "xmax"],
+ "ymax": ["bbox", "ymax"],
+ },
+ }
+
+ return {
+ "primary_column": df._geometry_column_name,
+ "columns": column_metadata,
+ "version": schema_version,
+ "creator": {"library": "geopandas", "version": geopandas.__version__},
+ }
def _encode_metadata(metadata):
@@ -75,7 +205,7 @@ def _encode_metadata(metadata):
-------
UTF-8 encoded JSON string
"""
- pass
+ return json.dumps(metadata).encode("utf-8")
def _decode_metadata(metadata_str):
@@ -89,7 +219,10 @@ def _decode_metadata(metadata_str):
-------
dict
"""
- pass
+ if metadata_str is None:
+ return None
+
+ return json.loads(metadata_str.decode("utf-8"))
def _validate_dataframe(df):
@@ -104,7 +237,20 @@ def _validate_dataframe(df):
----------
df : GeoDataFrame
"""
- pass
+
+ if not isinstance(df, DataFrame):
+ raise ValueError("Writing to Parquet/Feather only supports IO with DataFrames")
+
+ # must have value column names (strings only)
+ if df.columns.inferred_type not in {"string", "unicode", "empty"}:
+ raise ValueError("Writing to Parquet/Feather requires string column names")
+
+ # index level names must be strings
+ valid_names = all(
+ isinstance(name, str) for name in df.index.names if name is not None
+ )
+ if not valid_names:
+ raise ValueError("Index level names must be strings")
def _validate_geo_metadata(metadata):
@@ -117,20 +263,129 @@ def _validate_geo_metadata(metadata):
----------
metadata : dict
"""
- pass
-
-def _geopandas_to_arrow(df, index=None, geometry_encoding='WKB',
- schema_version=None, write_covering_bbox=None):
+ if not metadata:
+ raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
+
+ # version was schema_version in 0.1.0
+ version = metadata.get("version", metadata.get("schema_version"))
+ if not version:
+ raise ValueError(
+ "'geo' metadata in Parquet/Feather file is missing required key: "
+ "'version'"
+ )
+
+ required_keys = ("primary_column", "columns")
+ for key in required_keys:
+ if metadata.get(key, None) is None:
+ raise ValueError(
+ "'geo' metadata in Parquet/Feather file is missing required key: "
+ "'{key}'".format(key=key)
+ )
+
+ if not isinstance(metadata["columns"], dict):
+ raise ValueError("'columns' in 'geo' metadata must be a dict")
+
+ # Validate that geometry columns have required metadata and values
+ # leaving out "geometry_type" for compatibility with 0.1
+ required_col_keys = ("encoding",)
+ for col, column_metadata in metadata["columns"].items():
+ for key in required_col_keys:
+ if key not in column_metadata:
+ raise ValueError(
+ "'geo' metadata in Parquet/Feather file is missing required key "
+ "'{key}' for column '{col}'".format(key=key, col=col)
+ )
+
+ if column_metadata["encoding"] not in SUPPORTED_ENCODINGS:
+ raise ValueError(
+ "Only WKB geometry encoding or one of the native encodings "
+ f"({GEOARROW_ENCODINGS!r}) are supported, "
+ f"got: {column_metadata['encoding']}"
+ )
+
+ if column_metadata.get("edges", "planar") == "spherical":
+ warnings.warn(
+ f"The geo metadata indicate that column '{col}' has spherical edges, "
+ "but because GeoPandas currently does not support spherical "
+ "geometry, it ignores this metadata and will interpret the edges of "
+ "the geometries as planar.",
+ UserWarning,
+ stacklevel=4,
+ )
+
+ if "covering" in column_metadata:
+ covering = column_metadata["covering"]
+ if "bbox" in covering:
+ bbox = covering["bbox"]
+ for var in ["xmin", "ymin", "xmax", "ymax"]:
+ if var not in bbox.keys():
+ raise ValueError("Metadata for bbox column is malformed.")
+
+
+def _geopandas_to_arrow(
+ df,
+ index=None,
+ geometry_encoding="WKB",
+ schema_version=None,
+ write_covering_bbox=None,
+):
"""
Helper function with main, shared logic for to_parquet/to_feather.
"""
- pass
-
-
-def _to_parquet(df, path, index=None, compression='snappy',
- geometry_encoding='WKB', schema_version=None, write_covering_bbox=False,
- **kwargs):
+ from pyarrow import StructArray
+
+ from geopandas.io._geoarrow import geopandas_to_arrow
+
+ _validate_dataframe(df)
+
+ if schema_version is not None:
+ if geometry_encoding != "WKB" and schema_version != "1.1.0":
+ raise ValueError(
+ "'geoarrow' encoding is only supported with schema version >= 1.1.0"
+ )
+
+ table, geometry_encoding_dict = geopandas_to_arrow(
+ df, geometry_encoding=geometry_encoding, index=index, interleaved=False
+ )
+ geo_metadata = _create_metadata(
+ df,
+ schema_version=schema_version,
+ geometry_encoding=geometry_encoding_dict,
+ write_covering_bbox=write_covering_bbox,
+ )
+
+ if write_covering_bbox:
+ if "bbox" in df.columns:
+ raise ValueError(
+ "An existing column 'bbox' already exists in the dataframe. "
+ "Please rename to write covering bbox."
+ )
+ bounds = df.bounds
+ bbox_array = StructArray.from_arrays(
+ [bounds["minx"], bounds["miny"], bounds["maxx"], bounds["maxy"]],
+ names=["xmin", "ymin", "xmax", "ymax"],
+ )
+ table = table.append_column("bbox", bbox_array)
+
+ # Store geopandas specific file-level metadata
+ # This must be done AFTER creating the table or it is not persisted
+ metadata = table.schema.metadata
+ metadata.update({b"geo": _encode_metadata(geo_metadata)})
+
+ return table.replace_schema_metadata(metadata)
+
+
+def _to_parquet(
+ df,
+ path,
+ index=None,
+ compression="snappy",
+ geometry_encoding="WKB",
+ schema_version=None,
+ write_covering_bbox=False,
+ **kwargs,
+):
"""
Write a GeoDataFrame to the Parquet format.
@@ -169,11 +424,22 @@ def _to_parquet(df, path, index=None, compression='snappy',
**kwargs
Additional keyword arguments passed to pyarrow.parquet.write_table().
"""
- pass
-
-
-def _to_feather(df, path, index=None, compression=None, schema_version=None,
- **kwargs):
+ parquet = import_optional_dependency(
+ "pyarrow.parquet", extra="pyarrow is required for Parquet support."
+ )
+
+ path = _expand_user(path)
+ table = _geopandas_to_arrow(
+ df,
+ index=index,
+ geometry_encoding=geometry_encoding,
+ schema_version=schema_version,
+ write_covering_bbox=write_covering_bbox,
+ )
+ parquet.write_table(table, path, compression=compression, **kwargs)
+
+
+def _to_feather(df, path, index=None, compression=None, schema_version=None, **kwargs):
"""
Write a GeoDataFrame to the Feather format.
@@ -205,14 +471,88 @@ def _to_feather(df, path, index=None, compression=None, schema_version=None,
kwargs
Additional keyword arguments passed to pyarrow.feather.write_feather().
"""
- pass
+ feather = import_optional_dependency(
+ "pyarrow.feather", extra="pyarrow is required for Feather support."
+ )
+ # TODO move this into `import_optional_dependency`
+ import pyarrow
+
+ if Version(pyarrow.__version__) < Version("0.17.0"):
+ raise ImportError("pyarrow >= 0.17 required for Feather support")
+
+ path = _expand_user(path)
+ table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
+ feather.write_feather(table, path, compression=compression, **kwargs)
def _arrow_to_geopandas(table, geo_metadata=None):
"""
Helper function with main, shared logic for read_parquet/read_feather.
"""
- pass
+ if geo_metadata is None:
+ # Note: this path of not passing metadata is also used by dask-geopandas
+ geo_metadata = _validate_and_decode_metadata(table.schema.metadata)
+
+ # Find all geometry columns that were read from the file. May
+ # be a subset if 'columns' parameter is used.
+ geometry_columns = [
+ col for col in geo_metadata["columns"] if col in table.column_names
+ ]
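+    # converting an empty slice to pandas is a cheap way to get the final column order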
+ result_column_names = list(table.slice(0, 0).to_pandas().columns)
+ geometry_columns.sort(key=result_column_names.index)
+
+ if not len(geometry_columns):
+ raise ValueError(
+ """No geometry columns are included in the columns read from
+ the Parquet/Feather file. To read this file without geometry columns,
+ use pandas.read_parquet/read_feather() instead."""
+ )
+
+ geometry = geo_metadata["primary_column"]
+
+ # Missing geometry likely indicates a subset of columns was read;
+ # promote the first available geometry to the primary geometry.
+ if len(geometry_columns) and geometry not in geometry_columns:
+ geometry = geometry_columns[0]
+
+ # if there are multiple non-primary geometry columns, raise a warning
+ if len(geometry_columns) > 1:
+ warnings.warn(
+ "Multiple non-primary geometry columns read from Parquet/Feather "
+ "file. The first column read was promoted to the primary geometry.",
+ stacklevel=3,
+ )
+
+ table_attr = table.drop(geometry_columns)
+ df = table_attr.to_pandas()
+
+ # Convert the WKB columns that are present back to geometry.
+ for col in geometry_columns:
+ col_metadata = geo_metadata["columns"][col]
+ if "crs" in col_metadata:
+ crs = col_metadata["crs"]
+ if isinstance(crs, dict):
+ _remove_id_from_member_of_ensembles(crs)
+ else:
+ # per the GeoParquet spec, missing CRS is to be interpreted as
+ # OGC:CRS84
+ crs = "OGC:CRS84"
+
+ if col_metadata["encoding"] == "WKB":
+ geom_arr = from_wkb(np.array(table[col]), crs=crs)
+ else:
+ from geopandas.io._geoarrow import construct_shapely_array
+
+ geom_arr = from_shapely(
+ construct_shapely_array(
+ table[col].combine_chunks(), "geoarrow." + col_metadata["encoding"]
+ ),
+ crs=crs,
+ )
+
+ df.insert(result_column_names.index(col), col, geom_arr)
+
+ return GeoDataFrame(df, geometry=geometry)
def _get_filesystem_path(path, filesystem=None, storage_options=None):
@@ -221,7 +561,36 @@ def _get_filesystem_path(path, filesystem=None, storage_options=None):
If the filesystem is not None then it's just returned as is.
"""
- pass
+ import pyarrow
+
+ if (
+ isinstance(path, str)
+ and storage_options is None
+ and filesystem is None
+ and Version(pyarrow.__version__) >= Version("5.0.0")
+ ):
+ # Use the native pyarrow filesystem if possible.
+ try:
+ from pyarrow.fs import FileSystem
+
+ filesystem, path = FileSystem.from_uri(path)
+ except Exception:
+ # fallback to use get_handle / fsspec for filesystems
+ # that pyarrow doesn't support
+ pass
+
+ if _is_fsspec_url(path) and filesystem is None:
+ fsspec = import_optional_dependency(
+ "fsspec", extra="fsspec is requred for 'storage_options'."
+ )
+ filesystem, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
+
+ if filesystem is None and storage_options:
+ raise ValueError(
+ "Cannot provide 'storage_options' with non-fsspec path '{}'".format(path)
+ )
+
+ return filesystem, path
def _ensure_arrow_fs(filesystem):
@@ -230,7 +599,38 @@ def _ensure_arrow_fs(filesystem):
below because `pyarrow.parquet.read_metadata` does not yet accept a
filesystem keyword (https://issues.apache.org/jira/browse/ARROW-16719)
"""
- pass
+ from pyarrow import fs
+
+ if isinstance(filesystem, fs.FileSystem):
+ return filesystem
+
+ # handle fsspec-compatible filesystems
+ try:
+ import fsspec
+ except ImportError:
+ pass
+ else:
+ if isinstance(filesystem, fsspec.AbstractFileSystem):
+ return fs.PyFileSystem(fs.FSSpecHandler(filesystem))
+
+ return filesystem
+
+
+def _validate_and_decode_metadata(metadata):
+ if metadata is None or b"geo" not in metadata:
+ raise ValueError(
+ """Missing geo metadata in Parquet/Feather file.
+ Use pandas.read_parquet/read_feather() instead."""
+ )
+
+ # check for malformed metadata
+ try:
+ decoded_geo_metadata = _decode_metadata(metadata.get(b"geo", b""))
+ except (TypeError, json.decoder.JSONDecodeError):
+ raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
+
+ _validate_geo_metadata(decoded_geo_metadata)
+ return decoded_geo_metadata
def _read_parquet_schema_and_metadata(path, filesystem):
@@ -242,11 +642,33 @@ def _read_parquet_schema_and_metadata(path, filesystem):
that the ParquetDataset interface doesn't allow passing the filters on read)
"""
- pass
+ import pyarrow
+ from pyarrow import parquet
+ kwargs = {}
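+    # older pyarrow still needs use_legacy_dataset=False to opt in to the newer
+    # ParquetDataset implementation; newer pyarrow deprecates the keyword, so skip it there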
+ if Version(pyarrow.__version__) < Version("15.0.0"):
+ kwargs = dict(use_legacy_dataset=False)
-def _read_parquet(path, columns=None, storage_options=None, bbox=None, **kwargs
- ):
+ try:
+ schema = parquet.ParquetDataset(path, filesystem=filesystem, **kwargs).schema
+ except Exception:
+ schema = parquet.read_schema(path, filesystem=filesystem)
+
+ metadata = schema.metadata
+
+ # read metadata separately to get the raw Parquet FileMetaData metadata
+    # (pyarrow doesn't properly expose those in schema.metadata for files
+ # created by GDAL - https://issues.apache.org/jira/browse/ARROW-16688)
+ if metadata is None or b"geo" not in metadata:
+ try:
+ metadata = parquet.read_metadata(path, filesystem=filesystem).metadata
+ except Exception:
+ pass
+
+ return schema, metadata
+
+
+def _read_parquet(path, columns=None, storage_options=None, bbox=None, **kwargs):
"""
Load a Parquet object from the file path, returning a GeoDataFrame.
@@ -313,7 +735,48 @@ def _read_parquet(path, columns=None, storage_options=None, bbox=None, **kwargs
... columns=["geometry", "pop_est"]
... ) # doctest: +SKIP
"""
- pass
+
+ parquet = import_optional_dependency(
+ "pyarrow.parquet", extra="pyarrow is required for Parquet support."
+ )
+ import geopandas.io._pyarrow_hotfix # noqa: F401
+
+ # TODO(https://github.com/pandas-dev/pandas/pull/41194): see if pandas
+ # adds filesystem as a keyword and match that.
+ filesystem = kwargs.pop("filesystem", None)
+ filesystem, path = _get_filesystem_path(
+ path, filesystem=filesystem, storage_options=storage_options
+ )
+ path = _expand_user(path)
+ schema, metadata = _read_parquet_schema_and_metadata(path, filesystem)
+
+ geo_metadata = _validate_and_decode_metadata(metadata)
+
+ bbox_filter = (
+ _get_parquet_bbox_filter(geo_metadata, bbox) if bbox is not None else None
+ )
+
+ if_bbox_column_exists = _check_if_covering_in_geo_metadata(geo_metadata)
+
+ # by default, bbox column is not read in, so must specify which
+ # columns are read in if it exists.
+ if not columns and if_bbox_column_exists:
+ columns = _get_non_bbox_columns(schema, geo_metadata)
+
+ # if both bbox and filters kwargs are used, must splice together.
+ if "filters" in kwargs:
+ filters_kwarg = kwargs.pop("filters")
+ filters = _splice_bbox_and_filters(filters_kwarg, bbox_filter)
+ else:
+ filters = bbox_filter
+
+ kwargs["use_pandas_metadata"] = True
+
+ table = parquet.read_table(
+ path, columns=columns, filesystem=filesystem, filters=filters, **kwargs
+ )
+
+ return _arrow_to_geopandas(table, geo_metadata)
def _read_feather(path, columns=None, **kwargs):
@@ -367,4 +830,84 @@ def _read_feather(path, columns=None, **kwargs):
... columns=["geometry", "pop_est"]
... ) # doctest: +SKIP
"""
- pass
+
+ feather = import_optional_dependency(
+ "pyarrow.feather", extra="pyarrow is required for Feather support."
+ )
+ # TODO move this into `import_optional_dependency`
+ import pyarrow
+
+ import geopandas.io._pyarrow_hotfix # noqa: F401
+
+ if Version(pyarrow.__version__) < Version("0.17.0"):
+ raise ImportError("pyarrow >= 0.17 required for Feather support")
+
+ path = _expand_user(path)
+
+ table = feather.read_table(path, columns=columns, **kwargs)
+ return _arrow_to_geopandas(table)
+
+
+def _get_parquet_bbox_filter(geo_metadata, bbox):
+ primary_column = geo_metadata["primary_column"]
+
+ if _check_if_covering_in_geo_metadata(geo_metadata):
+ bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
+ return _convert_bbox_to_parquet_filter(bbox, bbox_column_name)
+
+ elif geo_metadata["columns"][primary_column]["encoding"] == "point":
+ import pyarrow.compute as pc
+
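+        # GeoArrow "point" encoding stores x/y as struct fields, so filter on them directly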
+ return (
+ (pc.field((primary_column, "x")) >= bbox[0])
+ & (pc.field((primary_column, "x")) <= bbox[2])
+ & (pc.field((primary_column, "y")) >= bbox[1])
+ & (pc.field((primary_column, "y")) <= bbox[3])
+ )
+
+ else:
+ raise ValueError(
+ "Specifying 'bbox' not supported for this Parquet file (it should either "
+ "have a bbox covering column or use 'point' encoding)."
+ )
+
+
+def _convert_bbox_to_parquet_filter(bbox, bbox_column_name):
+ import pyarrow.compute as pc
+
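+    # keep rows whose bbox intersects the query bbox by negating the "fully outside" test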
+ return ~(
+ (pc.field((bbox_column_name, "xmin")) > bbox[2])
+ | (pc.field((bbox_column_name, "ymin")) > bbox[3])
+ | (pc.field((bbox_column_name, "xmax")) < bbox[0])
+ | (pc.field((bbox_column_name, "ymax")) < bbox[1])
+ )
+
+
+def _check_if_covering_in_geo_metadata(geo_metadata):
+ primary_column = geo_metadata["primary_column"]
+ return "covering" in geo_metadata["columns"][primary_column].keys()
+
+
+def _get_bbox_encoding_column_name(geo_metadata):
+ primary_column = geo_metadata["primary_column"]
+ return geo_metadata["columns"][primary_column]["covering"]["bbox"]["xmin"][0]
+
+
+def _get_non_bbox_columns(schema, geo_metadata):
+
+ bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
+ columns = schema.names
+ if bbox_column_name in columns:
+ columns.remove(bbox_column_name)
+ return columns
+
+
+def _splice_bbox_and_filters(kwarg_filters, bbox_filter):
+ parquet = import_optional_dependency(
+ "pyarrow.parquet", extra="pyarrow is required for Parquet support."
+ )
+ if bbox_filter is None:
+ return kwarg_filters
+
+ filters_expression = parquet.filters_to_expression(kwarg_filters)
+ return bbox_filter & filters_expression
diff --git a/geopandas/io/file.py b/geopandas/io/file.py
index 43101f2e..37aa3038 100644
--- a/geopandas/io/file.py
+++ b/geopandas/io/file.py
@@ -1,51 +1,195 @@
from __future__ import annotations
+
import os
import urllib.request
import warnings
from io import IOBase
from packaging.version import Version
from pathlib import Path
+
+# Adapted from pandas.io.common
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc, uses_params, uses_relative
+
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
+
import shapely
from shapely.geometry import mapping
from shapely.geometry.base import BaseGeometry
+
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20
from geopandas.io.util import vsi_path
+
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
-_VALID_URLS.discard('')
-_VALID_URLS.discard('file')
+_VALID_URLS.discard("")
+# file:// URIs are supported by fiona/pyogrio -> don't already open + read the file here
+_VALID_URLS.discard("file")
+
fiona = None
fiona_env = None
fiona_import_error = None
FIONA_GE_19 = False
+
+
+def _import_fiona():
+ global fiona
+ global fiona_env
+ global fiona_import_error
+ global FIONA_GE_19
+
+ if fiona is None:
+ try:
+ import fiona
+
+ # only try to import fiona.Env if the main fiona import succeeded
+ # (otherwise you can get confusing "AttributeError: module 'fiona'
+ # has no attribute '_loading'" / partially initialized module errors)
+ try:
+ from fiona import Env as fiona_env
+ except ImportError:
+ try:
+ from fiona import drivers as fiona_env
+ except ImportError:
+ fiona_env = None
+
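+            # compare only the release part: base_version drops any dev/rc suffix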
+ FIONA_GE_19 = Version(Version(fiona.__version__).base_version) >= Version(
+ "1.9.0"
+ )
+
+ except ImportError as err:
+ fiona = False
+ fiona_import_error = str(err)
+
+
pyogrio = None
pyogrio_import_error = None
-_EXTENSION_TO_DRIVER = {'.bna': 'BNA', '.dxf': 'DXF', '.csv': 'CSV', '.shp':
- 'ESRI Shapefile', '.dbf': 'ESRI Shapefile', '.json': 'GeoJSON',
- '.geojson': 'GeoJSON', '.geojsonl': 'GeoJSONSeq', '.geojsons':
- 'GeoJSONSeq', '.gpkg': 'GPKG', '.gml': 'GML', '.xml': 'GML', '.gpx':
- 'GPX', '.gtm': 'GPSTrackMaker', '.gtz': 'GPSTrackMaker', '.tab':
- 'MapInfo File', '.mif': 'MapInfo File', '.mid': 'MapInfo File', '.dgn':
- 'DGN', '.fgb': 'FlatGeobuf'}
+
+
+def _import_pyogrio():
+ global pyogrio
+ global pyogrio_import_error
+
+ if pyogrio is None:
+ try:
+ import pyogrio
+
+ except ImportError as err:
+ pyogrio = False
+ pyogrio_import_error = str(err)
+
+
+def _check_fiona(func):
+ if not fiona:
+ raise ImportError(
+ f"the {func} requires the 'fiona' package, but it is not installed or does "
+ f"not import correctly.\nImporting fiona resulted in: {fiona_import_error}"
+ )
+
+
+def _check_pyogrio(func):
+ if not pyogrio:
+ raise ImportError(
+ f"the {func} requires the 'pyogrio' package, but it is not installed "
+ "or does not import correctly."
+ "\nImporting pyogrio resulted in: {pyogrio_import_error}"
+ )
+
+
+def _check_metadata_supported(metadata: str | None, engine: str, driver: str) -> None:
+ if metadata is None:
+ return
+ if driver != "GPKG":
+ raise NotImplementedError(
+ "The 'metadata' keyword is only supported for the GPKG driver."
+ )
+
+ if engine == "fiona" and not FIONA_GE_19:
+ raise NotImplementedError(
+ "The 'metadata' keyword is only supported for Fiona >= 1.9."
+ )
+
+
+def _check_engine(engine, func):
+ # if not specified through keyword or option, then default to "pyogrio" if
+ # installed, otherwise try fiona
+ if engine is None:
+ import geopandas
+
+ engine = geopandas.options.io_engine
+
+ if engine is None:
+ _import_pyogrio()
+ if pyogrio:
+ engine = "pyogrio"
+ else:
+ _import_fiona()
+ if fiona:
+ engine = "fiona"
+
+ if engine == "pyogrio":
+ _import_pyogrio()
+ _check_pyogrio(func)
+ elif engine == "fiona":
+ _import_fiona()
+ _check_fiona(func)
+ elif engine is None:
+ raise ImportError(
+ f"The {func} requires the 'pyogrio' or 'fiona' package, "
+ "but neither is installed or imports correctly."
+ f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
+ f"\nImporting fiona resulted in: {fiona_import_error}"
+ )
+
+ return engine
+
+
+_EXTENSION_TO_DRIVER = {
+ ".bna": "BNA",
+ ".dxf": "DXF",
+ ".csv": "CSV",
+ ".shp": "ESRI Shapefile",
+ ".dbf": "ESRI Shapefile",
+ ".json": "GeoJSON",
+ ".geojson": "GeoJSON",
+ ".geojsonl": "GeoJSONSeq",
+ ".geojsons": "GeoJSONSeq",
+ ".gpkg": "GPKG",
+ ".gml": "GML",
+ ".xml": "GML",
+ ".gpx": "GPX",
+ ".gtm": "GPSTrackMaker",
+ ".gtz": "GPSTrackMaker",
+ ".tab": "MapInfo File",
+ ".mif": "MapInfo File",
+ ".mid": "MapInfo File",
+ ".dgn": "DGN",
+ ".fgb": "FlatGeobuf",
+}
def _expand_user(path):
"""Expand paths that use ~."""
- pass
+ if isinstance(path, str):
+ path = os.path.expanduser(path)
+ elif isinstance(path, Path):
+ path = path.expanduser()
+ return path
def _is_url(url):
"""Check to see if *url* has a valid protocol."""
- pass
+ try:
+ return parse_url(url).scheme in _VALID_URLS
+ except Exception:
+ return False
-def _read_file(filename, bbox=None, mask=None, columns=None, rows=None,
- engine=None, **kwargs):
+def _read_file(
+ filename, bbox=None, mask=None, columns=None, rows=None, engine=None, **kwargs
+):
"""
Returns a GeoDataFrame from a file or URL.
@@ -131,18 +275,308 @@ def _read_file(filename, bbox=None, mask=None, columns=None, rows=None,
(https://gdal.org/user/virtual_file_systems.html#vsicurl-http-https-ftp-files-random-access).
"""
- pass
+ engine = _check_engine(engine, "'read_file' function")
+
+ filename = _expand_user(filename)
+
+ from_bytes = False
+ if _is_url(filename):
+ # if it is a url that supports random access -> pass through to
+ # pyogrio/fiona as is (to support downloading only part of the file)
+ # otherwise still download manually because pyogrio/fiona don't support
+ # all types of urls (https://github.com/geopandas/geopandas/issues/2908)
+ with urllib.request.urlopen(filename) as response:
+ if not response.headers.get("Accept-Ranges") == "bytes":
+ filename = response.read()
+ from_bytes = True
+
+ if engine == "pyogrio":
+ return _read_file_pyogrio(
+ filename, bbox=bbox, mask=mask, columns=columns, rows=rows, **kwargs
+ )
+
+ elif engine == "fiona":
+ if pd.api.types.is_file_like(filename):
+ data = filename.read()
+ path_or_bytes = data.encode("utf-8") if isinstance(data, str) else data
+ from_bytes = True
+ else:
+ path_or_bytes = filename
+
+ return _read_file_fiona(
+ path_or_bytes,
+ from_bytes,
+ bbox=bbox,
+ mask=mask,
+ columns=columns,
+ rows=rows,
+ **kwargs,
+ )
+
+ else:
+ raise ValueError(f"unknown engine '{engine}'")
+
+
+def _read_file_fiona(
+ path_or_bytes,
+ from_bytes,
+ bbox=None,
+ mask=None,
+ columns=None,
+ rows=None,
+ where=None,
+ **kwargs,
+):
+ if where is not None and not FIONA_GE_19:
+ raise NotImplementedError("where requires fiona 1.9+")
+
+ if columns is not None:
+ if "include_fields" in kwargs:
+ raise ValueError(
+ "Cannot specify both 'include_fields' and 'columns' keywords"
+ )
+ if not FIONA_GE_19:
+ raise NotImplementedError("'columns' keyword requires fiona 1.9+")
+ kwargs["include_fields"] = columns
+ elif "include_fields" in kwargs:
+ # alias to columns, as this variable is used below to specify column order
+ # in the dataframe creation
+ columns = kwargs["include_fields"]
+
+ if not from_bytes:
+ # Opening a file via URL or file-like-object above automatically detects a
+ # zipped file. In order to match that behavior, attempt to add a zip scheme
+ # if missing.
+ path_or_bytes = vsi_path(str(path_or_bytes))
+
+ if from_bytes:
+ reader = fiona.BytesCollection
+ else:
+ reader = fiona.open
+
+ with fiona_env():
+ with reader(path_or_bytes, **kwargs) as features:
+ crs = features.crs_wkt
+ # attempt to get EPSG code
+ try:
+ # fiona 1.9+
+ epsg = features.crs.to_epsg(confidence_threshold=100)
+ if epsg is not None:
+ crs = epsg
+ except AttributeError:
+ # fiona <= 1.8
+ try:
+ crs = features.crs["init"]
+ except (TypeError, KeyError):
+ pass
+
+ # handle loading the bounding box
+ if bbox is not None:
+ if isinstance(bbox, (GeoDataFrame, GeoSeries)):
+ bbox = tuple(bbox.to_crs(crs).total_bounds)
+ elif isinstance(bbox, BaseGeometry):
+ bbox = bbox.bounds
+ assert len(bbox) == 4
+ # handle loading the mask
+ elif isinstance(mask, (GeoDataFrame, GeoSeries)):
+ mask = mapping(mask.to_crs(crs).union_all())
+ elif isinstance(mask, BaseGeometry):
+ mask = mapping(mask)
+
+ filters = {}
+ if bbox is not None:
+ filters["bbox"] = bbox
+ if mask is not None:
+ filters["mask"] = mask
+ if where is not None:
+ filters["where"] = where
+
+ # setup the data loading filter
+ if rows is not None:
+ if isinstance(rows, int):
+ rows = slice(rows)
+ elif not isinstance(rows, slice):
+ raise TypeError("'rows' must be an integer or a slice.")
+ f_filt = features.filter(rows.start, rows.stop, rows.step, **filters)
+ elif filters:
+ f_filt = features.filter(**filters)
+ else:
+ f_filt = features
+ # get list of columns
+ columns = columns or list(features.schema["properties"])
+ datetime_fields = [
+ k for (k, v) in features.schema["properties"].items() if v == "datetime"
+ ]
+ if (
+ kwargs.get("ignore_geometry", False)
+ or features.schema["geometry"] == "None"
+ ):
+ df = pd.DataFrame(
+ [record["properties"] for record in f_filt], columns=columns
+ )
+ else:
+ df = GeoDataFrame.from_features(
+ f_filt, crs=crs, columns=columns + ["geometry"]
+ )
+ for k in datetime_fields:
+ as_dt = None
+ # plain try catch for when pandas will raise in the future
+ # TODO we can tighten the exception type in future when it does
+ try:
+ with warnings.catch_warnings():
+ # pandas 2.x does not yet enforce this behaviour but raises a
+                    # warning -> we want to suppress this warning for our users,
+ # and do this by turning it into an error so we take the
+ # `except` code path to try again with utc=True
+ warnings.filterwarnings(
+ "error",
+ "In a future version of pandas, parsing datetimes with "
+ "mixed time zones will raise an error",
+ FutureWarning,
+ )
+ as_dt = pd.to_datetime(df[k])
+ except Exception:
+ pass
+ if as_dt is None or as_dt.dtype == "object":
+ # if to_datetime failed, try again for mixed timezone offsets
+ # This can still fail if there are invalid datetimes
+ try:
+ as_dt = pd.to_datetime(df[k], utc=True)
+ except Exception:
+ pass
+ # if to_datetime succeeded, round datetimes as
+ # fiona only supports up to ms precision (any microseconds are
+ # floating point rounding error)
+ if as_dt is not None and not (as_dt.dtype == "object"):
+ if PANDAS_GE_20:
+ df[k] = as_dt.dt.as_unit("ms")
+ else:
+ df[k] = as_dt.dt.round(freq="ms")
+ return df
+
+
+def _read_file_pyogrio(path_or_bytes, bbox=None, mask=None, rows=None, **kwargs):
+ import pyogrio
+
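+    # translate the 'rows' keyword into pyogrio's skip_features / max_features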
+ if rows is not None:
+ if isinstance(rows, int):
+ kwargs["max_features"] = rows
+ elif isinstance(rows, slice):
+ if rows.start is not None:
+ if rows.start < 0:
+ raise ValueError(
+ "Negative slice start not supported with the 'pyogrio' engine."
+ )
+ kwargs["skip_features"] = rows.start
+ if rows.stop is not None:
+ kwargs["max_features"] = rows.stop - (rows.start or 0)
+ if rows.step is not None:
+ raise ValueError("slice with step is not supported")
+ else:
+ raise TypeError("'rows' must be an integer or a slice.")
+
+ if bbox is not None and mask is not None:
+ # match error message from Fiona
+ raise ValueError("mask and bbox can not be set together")
+
+ if bbox is not None:
+ if isinstance(bbox, (GeoDataFrame, GeoSeries)):
+ crs = pyogrio.read_info(path_or_bytes).get("crs")
+ if isinstance(path_or_bytes, IOBase):
+ path_or_bytes.seek(0)
+
+ bbox = tuple(bbox.to_crs(crs).total_bounds)
+ elif isinstance(bbox, BaseGeometry):
+ bbox = bbox.bounds
+ if len(bbox) != 4:
+ raise ValueError("'bbox' should be a length-4 tuple.")
+
+ if mask is not None:
+ # NOTE: mask cannot be used at same time as bbox keyword
+ if isinstance(mask, (GeoDataFrame, GeoSeries)):
+ crs = pyogrio.read_info(path_or_bytes).get("crs")
+ if isinstance(path_or_bytes, IOBase):
+ path_or_bytes.seek(0)
+
+ mask = shapely.unary_union(mask.to_crs(crs).geometry.values)
+ elif isinstance(mask, BaseGeometry):
+ mask = shapely.unary_union(mask)
+ elif isinstance(mask, dict) or hasattr(mask, "__geo_interface__"):
+ # convert GeoJSON to shapely geometry
+ mask = shapely.geometry.shape(mask)
+
+ kwargs["mask"] = mask
+
+ if kwargs.pop("ignore_geometry", False):
+ kwargs["read_geometry"] = False
+
+ # translate `ignore_fields`/`include_fields` keyword for back compat with fiona
+ if "ignore_fields" in kwargs and "include_fields" in kwargs:
+ raise ValueError("Cannot specify both 'ignore_fields' and 'include_fields'")
+ elif "ignore_fields" in kwargs:
+ if kwargs.get("columns", None) is not None:
+ raise ValueError(
+ "Cannot specify both 'columns' and 'ignore_fields' keywords"
+ )
+ warnings.warn(
+ "The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
+ "will be removed in a future release. You can use the 'columns' keyword "
+ "instead to select which columns to read.",
+ DeprecationWarning,
+ stacklevel=3,
+ )
+ ignore_fields = kwargs.pop("ignore_fields")
+ fields = pyogrio.read_info(path_or_bytes)["fields"]
+ include_fields = [col for col in fields if col not in ignore_fields]
+ kwargs["columns"] = include_fields
+ elif "include_fields" in kwargs:
+ # translate `include_fields` keyword for back compat with fiona engine
+ if kwargs.get("columns", None) is not None:
+ raise ValueError(
+ "Cannot specify both 'columns' and 'include_fields' keywords"
+ )
+ warnings.warn(
+ "The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
+ "will be removed in a future release. You can use the 'columns' keyword "
+ "instead to select which columns to read.",
+ DeprecationWarning,
+ stacklevel=3,
+ )
+ kwargs["columns"] = kwargs.pop("include_fields")
+
+ return pyogrio.read_dataframe(path_or_bytes, bbox=bbox, **kwargs)
def _detect_driver(path):
"""
Attempt to auto-detect driver based on the extension
"""
- pass
-
-
-def _to_file(df, filename, driver=None, schema=None, index=None, mode='w',
- crs=None, engine=None, metadata=None, **kwargs):
+ try:
+ # in case the path is a file handle
+ path = path.name
+ except AttributeError:
+ pass
+ try:
+ return _EXTENSION_TO_DRIVER[Path(path).suffix.lower()]
+ except KeyError:
+ # Assume it is a shapefile folder for now. In the future,
+ # will likely raise an exception when the expected
+ # folder writing behavior is more clearly defined.
+ return "ESRI Shapefile"
+
+
+def _to_file(
+ df,
+ filename,
+ driver=None,
+ schema=None,
+ index=None,
+ mode="w",
+ crs=None,
+ engine=None,
+ metadata=None,
+ **kwargs,
+):
"""
Write this GeoDataFrame to an OGR data source
@@ -214,17 +648,179 @@ def _to_file(df, filename, driver=None, schema=None, index=None, mode='w',
may fail. In this case, the proper encoding can be specified explicitly
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
"""
- pass
+ engine = _check_engine(engine, "'to_file' method")
+
+ filename = _expand_user(filename)
+
+ if index is None:
+ # Determine if index attribute(s) should be saved to file
+ # (only if they are named or are non-integer)
+ index = list(df.index.names) != [None] or not is_integer_dtype(df.index.dtype)
+ if index:
+ df = df.reset_index(drop=False)
+
+ if driver is None:
+ driver = _detect_driver(filename)
+
+ if driver == "ESRI Shapefile" and any(len(c) > 10 for c in df.columns.tolist()):
+ warnings.warn(
+ "Column names longer than 10 characters will be truncated when saved to "
+ "ESRI Shapefile.",
+ stacklevel=3,
+ )
+
+ if (df.dtypes == "geometry").sum() > 1:
+ raise ValueError(
+ "GeoDataFrame contains multiple geometry columns but GeoDataFrame.to_file "
+ "supports only a single geometry column. Use a GeoDataFrame.to_parquet or "
+ "GeoDataFrame.to_feather, drop additional geometry columns or convert them "
+ "to a supported format like a well-known text (WKT) using "
+ "`GeoSeries.to_wkt()`.",
+ )
+ _check_metadata_supported(metadata, engine, driver)
+
+ if mode not in ("w", "a"):
+ raise ValueError(f"'mode' should be one of 'w' or 'a', got '{mode}' instead")
+
+ if engine == "pyogrio":
+ _to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs)
+ elif engine == "fiona":
+ _to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs)
+ else:
+ raise ValueError(f"unknown engine '{engine}'")
+
+
+def _to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs):
+ if not HAS_PYPROJ and crs:
+ raise ImportError(
+ "The 'pyproj' package is required to write a file with a CRS, but it is not"
+ " installed or does not import correctly."
+ )
+
+ if schema is None:
+ schema = infer_schema(df)
+
+ if crs:
+ from pyproj import CRS
+
+ crs = CRS.from_user_input(crs)
+ else:
+ crs = df.crs
+
+ with fiona_env():
+ crs_wkt = None
+ try:
+ gdal_version = Version(
+ fiona.env.get_gdal_release_name().strip("e")
+ ) # GH3147
+ except (AttributeError, ValueError):
+ gdal_version = Version("2.0.0") # just assume it is not the latest
+ if gdal_version >= Version("3.0.0") and crs:
+ crs_wkt = crs.to_wkt()
+ elif crs:
+ crs_wkt = crs.to_wkt("WKT1_GDAL")
+ with fiona.open(
+ filename, mode=mode, driver=driver, crs_wkt=crs_wkt, schema=schema, **kwargs
+ ) as colxn:
+ if metadata is not None:
+ colxn.update_tags(metadata)
+ colxn.writerecords(df.iterfeatures())
+
+
+def _to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs):
+ import pyogrio
+
+ if schema is not None:
+ raise ValueError(
+ "The 'schema' argument is not supported with the 'pyogrio' engine."
+ )
+
+ if mode == "a":
+ kwargs["append"] = True
+
+ if crs is not None:
+ raise ValueError("Passing 'crs' is not supported with the 'pyogrio' engine.")
+
+ # for the fiona engine, this check is done in gdf.iterfeatures()
+ if not df.columns.is_unique:
+ raise ValueError("GeoDataFrame cannot contain duplicated column names.")
+
+ pyogrio.write_dataframe(df, filename, driver=driver, metadata=metadata, **kwargs)
+
+
+def infer_schema(df):
+ from collections import OrderedDict
+
+ # TODO: test pandas string type and boolean type once released
+ types = {
+ "Int32": "int32",
+ "int32": "int32",
+ "Int64": "int",
+ "string": "str",
+ "boolean": "bool",
+ }
+
+ def convert_type(column, in_type):
+ if in_type == object:
+ return "str"
+ if in_type.name.startswith("datetime64"):
+ # numpy datetime type regardless of frequency
+ return "datetime"
+ if str(in_type) in types:
+ out_type = types[str(in_type)]
+ else:
+ out_type = type(np.zeros(1, in_type).item()).__name__
+ if out_type == "long":
+ out_type = "int"
+ return out_type
+
+ properties = OrderedDict(
+ [
+ (col, convert_type(col, _type))
+ for col, _type in zip(df.columns, df.dtypes)
+ if col != df._geometry_column_name
+ ]
+ )
+
+ if df.empty:
+ warnings.warn(
+ "You are attempting to write an empty DataFrame to file. "
+ "For some drivers, this operation may fail.",
+ UserWarning,
+ stacklevel=3,
+ )
+
+ # Since https://github.com/Toblerity/Fiona/issues/446 resolution,
+ # Fiona allows a list of geometry types
+ geom_types = _geometry_types(df)
+
+ schema = {"geometry": geom_types, "properties": properties}
+
+ return schema
def _geometry_types(df):
"""
Determine the geometry types in the GeoDataFrame for the schema.
"""
- pass
+ geom_types_2D = df[~df.geometry.has_z].geometry.geom_type.unique()
+ geom_types_2D = [gtype for gtype in geom_types_2D if gtype is not None]
+ geom_types_3D = df[df.geometry.has_z].geometry.geom_type.unique()
+ geom_types_3D = ["3D " + gtype for gtype in geom_types_3D if gtype is not None]
+ geom_types = geom_types_3D + geom_types_2D
+
+ if len(geom_types) == 0:
+ # Default geometry type supported by Fiona
+ # (Since https://github.com/Toblerity/Fiona/issues/446 resolution)
+ return "Unknown"
+
+ if len(geom_types) == 1:
+ geom_types = geom_types[0]
+ return geom_types
-def _list_layers(filename) ->pd.DataFrame:
+
+def _list_layers(filename) -> pd.DataFrame:
"""List layers available in a file.
Provides an overview of layers available in a file or URL together with their
@@ -245,4 +841,11 @@ def _list_layers(filename) ->pd.DataFrame:
pandas.DataFrame
A DataFrame with columns "name" and "geometry_type" and one row per layer.
"""
- pass
+ _import_pyogrio()
+ _check_pyogrio("list_layers")
+
+ import pyogrio
+
+ return pd.DataFrame(
+ pyogrio.list_layers(filename), columns=["name", "geometry_type"]
+ )
diff --git a/geopandas/io/sql.py b/geopandas/io/sql.py
index 12554611..0f99b09e 100644
--- a/geopandas/io/sql.py
+++ b/geopandas/io/sql.py
@@ -1,9 +1,12 @@
import warnings
from contextlib import contextmanager
from functools import lru_cache
+
import pandas as pd
+
import shapely
import shapely.wkb
+
from geopandas import GeoDataFrame
@@ -24,10 +27,22 @@ def _get_conn(conn_or_engine):
-------
Connection
"""
- pass
+ from sqlalchemy.engine.base import Connection, Engine
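+    # reuse the caller's transaction when one is already open; otherwise start our own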
+ if isinstance(conn_or_engine, Connection):
+ if not conn_or_engine.in_transaction():
+ with conn_or_engine.begin():
+ yield conn_or_engine
+ else:
+ yield conn_or_engine
+ elif isinstance(conn_or_engine, Engine):
+ with conn_or_engine.begin() as conn:
+ yield conn
+ else:
+ raise ValueError(f"Unknown Connectable: {conn_or_engine}")
-def _df_to_geodf(df, geom_col='geom', crs=None, con=None):
+
+def _df_to_geodf(df, geom_col="geom", crs=None, con=None):
"""
Transforms a pandas DataFrame into a GeoDataFrame.
The column 'geom_col' must be a geometry column in WKB representation.
@@ -50,11 +65,73 @@ def _df_to_geodf(df, geom_col='geom', crs=None, con=None):
-------
GeoDataFrame
"""
- pass
+ if geom_col not in df:
+ raise ValueError("Query missing geometry column '{}'".format(geom_col))
+
+ if df.columns.to_list().count(geom_col) > 1:
+ raise ValueError(
+ f"Duplicate geometry column '{geom_col}' detected in SQL query output. Only"
+ "one geometry column is allowed."
+ )
+
+ geoms = df[geom_col].dropna()
+
+ if not geoms.empty:
+ load_geom_bytes = shapely.wkb.loads
+ """Load from Python 3 binary."""
+
+ def load_geom_text(x):
+ """Load from binary encoded as text."""
+ return shapely.wkb.loads(str(x), hex=True)
+
+ if isinstance(geoms.iat[0], bytes):
+ load_geom = load_geom_bytes
+ else:
+ load_geom = load_geom_text
+
+ df[geom_col] = geoms = geoms.apply(load_geom)
+ if crs is None:
+ srid = shapely.get_srid(geoms.iat[0])
+ # if no defined SRID in geodatabase, returns SRID of 0
+ if srid != 0:
+ try:
+ spatial_ref_sys_df = _get_spatial_ref_sys_df(con, srid)
+ except pd.errors.DatabaseError:
+ warning_msg = (
+ f"Could not find the spatial reference system table "
+                        f"(spatial_ref_sys) in PostGIS. "
+ f"Trying epsg:{srid} as a fallback."
+ )
+ warnings.warn(warning_msg, UserWarning, stacklevel=3)
+ crs = "epsg:{}".format(srid)
+ else:
+ if not spatial_ref_sys_df.empty:
+ auth_name = spatial_ref_sys_df["auth_name"].item()
+ crs = f"{auth_name}:{srid}"
+ else:
+ warning_msg = (
+ f"Could not find srid {srid} in the "
+ f"spatial_ref_sys table. "
+ f"Trying epsg:{srid} as a fallback."
+ )
+ warnings.warn(warning_msg, UserWarning, stacklevel=3)
+ crs = "epsg:{}".format(srid)
+
+ return GeoDataFrame(df, crs=crs, geometry=geom_col)
-def _read_postgis(sql, con, geom_col='geom', crs=None, index_col=None,
- coerce_float=True, parse_dates=None, params=None, chunksize=None):
+
+def _read_postgis(
+ sql,
+ con,
+ geom_col="geom",
+ crs=None,
+ index_col=None,
+ coerce_float=True,
+ parse_dates=None,
+ params=None,
+ chunksize=None,
+):
"""
Returns a GeoDataFrame corresponding to the result of the query
string, which must contain a geometry column in WKB representation.
@@ -102,7 +179,34 @@ def _read_postgis(sql, con, geom_col='geom', crs=None, index_col=None,
>>> sql = "SELECT ST_AsBinary(geom) AS geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
"""
- pass
+
+ if chunksize is None:
+ # read all in one chunk and return a single GeoDataFrame
+ df = pd.read_sql(
+ sql,
+ con,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ params=params,
+ chunksize=chunksize,
+ )
+ return _df_to_geodf(df, geom_col=geom_col, crs=crs, con=con)
+
+ else:
+ # read data in chunks and return a generator
+ df_generator = pd.read_sql(
+ sql,
+ con,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ params=params,
+ chunksize=chunksize,
+ )
+ return (
+ _df_to_geodf(df, geom_col=geom_col, crs=crs, con=con) for df in df_generator
+ )
def _get_geometry_type(gdf):
@@ -124,23 +228,131 @@ def _get_geometry_type(gdf):
- if any of the geometries has Z-coordinate, all records will
be written with 3D.
"""
- pass
+ geom_types = list(gdf.geometry.geom_type.unique())
+ has_curve = False
+
+ for gt in geom_types:
+ if gt is None:
+ continue
+ elif "LinearRing" in gt:
+ has_curve = True
+
+ if len(geom_types) == 1:
+ if has_curve:
+ target_geom_type = "LINESTRING"
+ else:
+ if geom_types[0] is None:
+ raise ValueError("No valid geometries in the data.")
+ else:
+ target_geom_type = geom_types[0].upper()
+ else:
+ target_geom_type = "GEOMETRY"
+
+ # Check for 3D-coordinates
+ if any(gdf.geometry.has_z):
+ target_geom_type += "Z"
+
+ return target_geom_type, has_curve
def _get_srid_from_crs(gdf):
"""
Get EPSG code from CRS if available. If not, return 0.
"""
- pass
+
+ # Use geoalchemy2 default for srid
+ # Note: undefined srid in PostGIS is 0
+ srid = None
+ warning_msg = (
+ "Could not parse CRS from the GeoDataFrame. "
+ "Inserting data without defined CRS."
+ )
+ if gdf.crs is not None:
+ try:
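+            # try EPSG at decreasing confidence, then fall back to an ESRI authority code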
+ for confidence in (100, 70, 25):
+ srid = gdf.crs.to_epsg(min_confidence=confidence)
+ if srid is not None:
+ break
+ auth_srid = gdf.crs.to_authority(
+ auth_name="ESRI", min_confidence=confidence
+ )
+ if auth_srid is not None:
+ srid = int(auth_srid[1])
+ break
+ except Exception:
+ warnings.warn(warning_msg, UserWarning, stacklevel=2)
+
+ if srid is None:
+ srid = 0
+ warnings.warn(warning_msg, UserWarning, stacklevel=2)
+
+ return srid
+
+
+def _convert_linearring_to_linestring(gdf, geom_name):
+ from shapely.geometry import LineString
+
+ # Todo: Use shapely function once it's implemented:
+ # https://github.com/shapely/shapely/issues/1617
+
+ mask = gdf.geom_type == "LinearRing"
+ gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(
+ lambda geom: LineString(geom)
+ )
+ return gdf
def _convert_to_ewkb(gdf, geom_name, srid):
"""Convert geometries to ewkb."""
- pass
+ geoms = shapely.to_wkb(
+ shapely.set_srid(gdf[geom_name].values._data, srid=srid),
+ hex=True,
+ include_srid=True,
+ )
+
+ # The gdf will warn that the geometry column doesn't hold in-memory geometries
+ # now that they are EWKB, so convert back to a regular dataframe to avoid warning
+ # the user that the dtypes are unexpected.
+ df = pd.DataFrame(gdf, copy=False)
+ df[geom_name] = geoms
+ return df
+
+
+def _psql_insert_copy(tbl, conn, keys, data_iter):
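+    # used as the pandas .to_sql(method=...) callable: bulk-loads rows via PostgreSQL COPY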
+ import csv
+ import io
+ s_buf = io.StringIO()
+ writer = csv.writer(s_buf)
+ writer.writerows(data_iter)
+ s_buf.seek(0)
-def _write_postgis(gdf, name, con, schema=None, if_exists='fail', index=
- False, index_label=None, chunksize=None, dtype=None):
+ columns = ", ".join('"{}"'.format(k) for k in keys)
+
+ dbapi_conn = conn.connection
+ sql = 'COPY "{}"."{}" ({}) FROM STDIN WITH CSV'.format(
+ tbl.table.schema, tbl.table.name, columns
+ )
+ with dbapi_conn.cursor() as cur:
+ # Use psycopg method if it's available
+ if hasattr(cur, "copy") and callable(cur.copy):
+ with cur.copy(sql) as copy:
+ copy.write(s_buf.read())
+ else: # otherwise use psycopg2 method
+ cur.copy_expert(sql, s_buf)
+
+
+def _write_postgis(
+ gdf,
+ name,
+ con,
+ schema=None,
+ if_exists="fail",
+ index=False,
+ index_label=None,
+ chunksize=None,
+ dtype=None,
+):
"""
Upload GeoDataFrame into PostGIS database.
@@ -180,7 +392,82 @@ def _write_postgis(gdf, name, con, schema=None, if_exists='fail', index=
--------
>>> from sqlalchemy import create_engine # doctest: +SKIP
-    >>> engine = create_engine("postgresql://myusername:mypassword@myhost:5432/mydatabase")  # doctest: +SKIP
+    >>> engine = create_engine("postgresql://myusername:mypassword@myhost:5432\
+/mydatabase")  # doctest: +SKIP
>>> gdf.to_postgis("my_table", engine) # doctest: +SKIP
"""
- pass
+ try:
+ from geoalchemy2 import Geometry
+ from sqlalchemy import text
+ except ImportError:
+ raise ImportError("'to_postgis()' requires geoalchemy2 package.")
+
+ gdf = gdf.copy()
+ geom_name = gdf.geometry.name
+
+ # Get srid
+ srid = _get_srid_from_crs(gdf)
+
+ # Get geometry type and info whether data contains LinearRing.
+ geometry_type, has_curve = _get_geometry_type(gdf)
+
+ # Build dtype with Geometry
+ if dtype is not None:
+ dtype[geom_name] = Geometry(geometry_type=geometry_type, srid=srid)
+ else:
+ dtype = {geom_name: Geometry(geometry_type=geometry_type, srid=srid)}
+
+ # Convert LinearRing geometries to LineString
+ if has_curve:
+ gdf = _convert_linearring_to_linestring(gdf, geom_name)
+
+ # Convert geometries to EWKB
+ gdf = _convert_to_ewkb(gdf, geom_name, srid)
+
+ if schema is not None:
+ schema_name = schema
+ else:
+ schema_name = "public"
+
+ if if_exists == "append":
+ # Check that the geometry srid matches with the current GeoDataFrame
+ with _get_conn(con) as connection:
+ # Only check SRID if table exists
+ if connection.dialect.has_table(connection, name, schema):
+ target_srid = connection.execute(
+ text(
+ "SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
+ schema=schema_name, table=name, geom_col=geom_name
+ )
+ )
+ ).fetchone()[0]
+
+ if target_srid != srid:
+ msg = (
+ "The CRS of the target table (EPSG:{epsg_t}) differs from the "
+ "CRS of current GeoDataFrame (EPSG:{epsg_src}).".format(
+ epsg_t=target_srid, epsg_src=srid
+ )
+ )
+ raise ValueError(msg)
+
+ with _get_conn(con) as connection:
+ gdf.to_sql(
+ name,
+ connection,
+ schema=schema_name,
+ if_exists=if_exists,
+ index=index,
+ index_label=index_label,
+ chunksize=chunksize,
+ dtype=dtype,
+ method=_psql_insert_copy,
+ )
+
+
+@lru_cache
+def _get_spatial_ref_sys_df(con, srid):
+ spatial_ref_sys_sql = (
+ f"SELECT srid, auth_name FROM spatial_ref_sys WHERE srid = {srid}"
+ )
+ return pd.read_sql(spatial_ref_sys_sql, con)
diff --git a/geopandas/io/tests/generate_legacy_storage_files.py b/geopandas/io/tests/generate_legacy_storage_files.py
index 9b4e0426..fb6e136f 100644
--- a/geopandas/io/tests/generate_legacy_storage_files.py
+++ b/geopandas/io/tests/generate_legacy_storage_files.py
@@ -6,7 +6,8 @@ Based on pandas' generate_legacy_storage_files.py script.
To use this script, create an environment for which you want to
generate pickles, activate the environment, and run this script as:
-$ python geopandas/geopandas/io/tests/generate_legacy_storage_files.py geopandas/geopandas/io/tests/data/pickle/ pickle
+$ python geopandas/geopandas/io/tests/generate_legacy_storage_files.py \
+ geopandas/geopandas/io/tests/data/pickle/ pickle
This script generates a storage file for the current arch, system,
@@ -18,19 +19,82 @@ pickles and test versus the current data that is generated
(with master). These are then compared.
"""
+
import os
import pickle
import platform
import sys
+
import pandas as pd
+
from shapely.geometry import Point
+
import geopandas
def create_pickle_data():
"""create the pickle data"""
- pass
+
+ # custom geometry column name
+ gdf_the_geom = geopandas.GeoDataFrame(
+ {"a": [1, 2, 3], "the_geom": [Point(1, 1), Point(2, 2), Point(3, 3)]},
+ geometry="the_geom",
+ )
+
+ # with crs
+ gdf_crs = geopandas.GeoDataFrame(
+ {"a": [0.1, 0.2, 0.3], "geometry": [Point(1, 1), Point(2, 2), Point(3, 3)]},
+ crs="EPSG:4326",
+ )
+
+ return {"gdf_the_geom": gdf_the_geom, "gdf_crs": gdf_crs}
+
+
+def platform_name():
+ return "_".join(
+ [
+ str(geopandas.__version__),
+ "pd-" + str(pd.__version__),
+ "py-" + str(platform.python_version()),
+ str(platform.machine()),
+ str(platform.system().lower()),
+ ]
+ )
+
+
+def write_legacy_pickles(output_dir):
+ print(
+ "This script generates a storage file for the current arch, system, "
+ "and python version"
+ )
+ print("geopandas version: {}").format(geopandas.__version__)
+ print(" output dir : {}".format(output_dir))
+ print(" storage format: pickle")
+
+ pth = "{}.pickle".format(platform_name())
+
+ fh = open(os.path.join(output_dir, pth), "wb")
+ pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)
+ fh.close()
+
+ print("created pickle file: {}".format(pth))
+
+
+def main():
+ if len(sys.argv) != 3:
+ sys.exit(
+ "Specify output directory and storage type: generate_legacy_"
+ "storage_files.py <output_dir> <storage_type> "
+ )
+
+ output_dir = str(sys.argv[1])
+ storage_type = str(sys.argv[2])
+
+ if storage_type == "pickle":
+ write_legacy_pickles(output_dir=output_dir)
+ else:
+ sys.exit("storage_type must be one of {'pickle'}")
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/geopandas/io/tests/test_arrow.py b/geopandas/io/tests/test_arrow.py
index a5dbeb1b..9078191d 100644
--- a/geopandas/io/tests/test_arrow.py
+++ b/geopandas/io/tests/test_arrow.py
@@ -1,81 +1,552 @@
from __future__ import absolute_import
+
import json
import os
import pathlib
from itertools import product
from packaging.version import Version
+
import numpy as np
from pandas import DataFrame
from pandas import read_parquet as pd_read_parquet
+
import shapely
from shapely.geometry import LineString, MultiPolygon, Point, Polygon, box
+
import geopandas
from geopandas import GeoDataFrame, read_feather, read_file, read_parquet
from geopandas._compat import HAS_PYPROJ
from geopandas.array import to_wkb
-from geopandas.io.arrow import METADATA_VERSION, SUPPORTED_VERSIONS, _convert_bbox_to_parquet_filter, _create_metadata, _decode_metadata, _encode_metadata, _geopandas_to_arrow, _get_filesystem_path, _remove_id_from_member_of_ensembles, _validate_dataframe, _validate_geo_metadata
+from geopandas.io.arrow import (
+ METADATA_VERSION,
+ SUPPORTED_VERSIONS,
+ _convert_bbox_to_parquet_filter,
+ _create_metadata,
+ _decode_metadata,
+ _encode_metadata,
+ _geopandas_to_arrow,
+ _get_filesystem_path,
+ _remove_id_from_member_of_ensembles,
+ _validate_dataframe,
+ _validate_geo_metadata,
+)
+
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from geopandas.tests.util import mock
from pandas.testing import assert_frame_equal
-DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / 'data'
-pyarrow = pytest.importorskip('pyarrow')
+
+DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
+
+
+# Skip all tests in this module if pyarrow is not available
+pyarrow = pytest.importorskip("pyarrow")
+
import pyarrow.compute as pc
import pyarrow.parquet as pq
from pyarrow import feather
-@pytest.mark.parametrize('test_dataset', ['naturalearth_lowres',
- 'naturalearth_cities', 'nybb_filename'])
+@pytest.fixture(
+ params=[
+ "parquet",
+ pytest.param(
+ "feather",
+ marks=pytest.mark.skipif(
+ Version(pyarrow.__version__) < Version("0.17.0"),
+ reason="needs pyarrow >= 0.17",
+ ),
+ ),
+ ]
+)
+def file_format(request):
+ if request.param == "parquet":
+ return read_parquet, GeoDataFrame.to_parquet
+ elif request.param == "feather":
+ return read_feather, GeoDataFrame.to_feather
+
+
+def test_create_metadata(naturalearth_lowres):
+ df = read_file(naturalearth_lowres)
+ metadata = _create_metadata(df, geometry_encoding={"geometry": "WKB"})
+
+ assert isinstance(metadata, dict)
+ assert metadata["version"] == METADATA_VERSION
+ assert metadata["primary_column"] == "geometry"
+ assert "geometry" in metadata["columns"]
+ if HAS_PYPROJ:
+ crs_expected = df.crs.to_json_dict()
+ _remove_id_from_member_of_ensembles(crs_expected)
+ assert metadata["columns"]["geometry"]["crs"] == crs_expected
+ assert metadata["columns"]["geometry"]["encoding"] == "WKB"
+ assert metadata["columns"]["geometry"]["geometry_types"] == [
+ "MultiPolygon",
+ "Polygon",
+ ]
+
+ assert np.array_equal(
+ metadata["columns"]["geometry"]["bbox"], df.geometry.total_bounds
+ )
+
+ assert metadata["creator"]["library"] == "geopandas"
+ assert metadata["creator"]["version"] == geopandas.__version__
+
+ # specifying non-WKB encoding sets default schema to 1.1.0
+ metadata = _create_metadata(df, geometry_encoding={"geometry": "point"})
+ assert metadata["version"] == "1.1.0"
+
+
+def test_create_metadata_with_z_geometries():
+ geometry_types = [
+ "Point",
+ "Point Z",
+ "LineString",
+ "LineString Z",
+ "Polygon",
+ "Polygon Z",
+ "MultiPolygon",
+ "MultiPolygon Z",
+ ]
+ df = geopandas.GeoDataFrame(
+ {
+ "geo_type": geometry_types,
+ "geometry": [
+ Point(1, 2),
+ Point(1, 2, 3),
+ LineString([(0, 0), (1, 1), (2, 2)]),
+ LineString([(0, 0, 1), (1, 1, 2), (2, 2, 3)]),
+ Polygon([(0, 0), (0, 1), (1, 1), (1, 0)]),
+ Polygon([(0, 0, 0), (0, 1, 0.5), (1, 1, 1), (1, 0, 0.5)]),
+ MultiPolygon(
+ [
+ Polygon([(0, 0), (0, 1), (1, 1), (1, 0)]),
+ Polygon([(0.5, 0.5), (0.5, 1.5), (1.5, 1.5), (1.5, 0.5)]),
+ ]
+ ),
+ MultiPolygon(
+ [
+ Polygon([(0, 0, 0), (0, 1, 0.5), (1, 1, 1), (1, 0, 0.5)]),
+ Polygon(
+ [
+ (0.5, 0.5, 1),
+ (0.5, 1.5, 1.5),
+ (1.5, 1.5, 2),
+ (1.5, 0.5, 1.5),
+ ]
+ ),
+ ]
+ ),
+ ],
+ },
+ )
+ metadata = _create_metadata(df, geometry_encoding={"geometry": "WKB"})
+ assert sorted(metadata["columns"]["geometry"]["geometry_types"]) == sorted(
+ geometry_types
+ )
+ # only 3D geometries
+ metadata = _create_metadata(df.iloc[1::2], geometry_encoding={"geometry": "WKB"})
+ assert all(
+ geom_type.endswith(" Z")
+ for geom_type in metadata["columns"]["geometry"]["geometry_types"]
+ )
+
+ metadata = _create_metadata(df.iloc[5:7], geometry_encoding={"geometry": "WKB"})
+ assert metadata["columns"]["geometry"]["geometry_types"] == [
+ "MultiPolygon",
+ "Polygon Z",
+ ]
+
+
+def test_crs_metadata_datum_ensemble():
+ pyproj = pytest.importorskip("pyproj")
+ # compatibility for older PROJ versions using PROJJSON with datum ensembles
+ # https://github.com/geopandas/geopandas/pull/2453
+ crs = pyproj.CRS("EPSG:4326")
+ crs_json = crs.to_json_dict()
+ check_ensemble = False
+ if "datum_ensemble" in crs_json:
+ # older version of PROJ don't yet have datum ensembles
+ check_ensemble = True
+ assert "id" in crs_json["datum_ensemble"]["members"][0]
+ _remove_id_from_member_of_ensembles(crs_json)
+ if check_ensemble:
+ assert "id" not in crs_json["datum_ensemble"]["members"][0]
+ # ensure roundtrip still results in an equivalent CRS
+ assert pyproj.CRS(crs_json) == crs
+
+
+def test_write_metadata_invalid_spec_version(tmp_path):
+ gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="EPSG:4326")
+ with pytest.raises(ValueError, match="schema_version must be one of"):
+ _create_metadata(gdf, schema_version="invalid")
+
+ with pytest.raises(
+ ValueError,
+ match="'geoarrow' encoding is only supported with schema version >= 1.1.0",
+ ):
+ gdf.to_parquet(tmp_path, schema_version="1.0.0", geometry_encoding="geoarrow")
+
+
+def test_encode_metadata():
+ metadata = {"a": "b"}
+
+ expected = b'{"a": "b"}'
+ assert _encode_metadata(metadata) == expected
+
+
+def test_decode_metadata():
+ metadata_str = b'{"a": "b"}'
+
+ expected = {"a": "b"}
+ assert _decode_metadata(metadata_str) == expected
+
+ assert _decode_metadata(None) is None
+
+
+def test_validate_dataframe(naturalearth_lowres):
+ df = read_file(naturalearth_lowres)
+
+ # valid: should not raise ValueError
+ _validate_dataframe(df)
+ _validate_dataframe(df.set_index("iso_a3"))
+
+ # add column with non-string type
+ df[0] = 1
+
+ # invalid: should raise ValueError
+ with pytest.raises(ValueError):
+ _validate_dataframe(df)
+
+ with pytest.raises(ValueError):
+ _validate_dataframe(df.set_index(0))
+
+ # not a DataFrame: should raise ValueError
+ with pytest.raises(ValueError):
+ _validate_dataframe("not a dataframe")
+
+
+def test_validate_geo_metadata_valid():
+ _validate_geo_metadata(
+ {
+ "primary_column": "geometry",
+ "columns": {"geometry": {"crs": None, "encoding": "WKB"}},
+ "schema_version": "0.1.0",
+ }
+ )
+
+ _validate_geo_metadata(
+ {
+ "primary_column": "geometry",
+ "columns": {"geometry": {"crs": None, "encoding": "WKB"}},
+ "version": "<version>",
+ }
+ )
+
+ _validate_geo_metadata(
+ {
+ "primary_column": "geometry",
+ "columns": {
+ "geometry": {
+ "crs": {
+ # truncated PROJJSON for testing, as PROJJSON contents
+ # not validated here
+ "id": {"authority": "EPSG", "code": 4326},
+ },
+ "encoding": "point",
+ }
+ },
+ "version": "0.4.0",
+ }
+ )
+
+
+@pytest.mark.parametrize(
+ "metadata,error",
+ [
+ (None, "Missing or malformed geo metadata in Parquet/Feather file"),
+ ({}, "Missing or malformed geo metadata in Parquet/Feather file"),
+ # missing "version" key:
+ (
+ {"primary_column": "foo", "columns": None},
+ "'geo' metadata in Parquet/Feather file is missing required key",
+ ),
+ # missing "columns" key:
+ (
+ {"primary_column": "foo", "version": "<version>"},
+ "'geo' metadata in Parquet/Feather file is missing required key:",
+ ),
+ # missing "primary_column"
+ (
+ {"columns": [], "version": "<version>"},
+ "'geo' metadata in Parquet/Feather file is missing required key:",
+ ),
+ (
+ {"primary_column": "foo", "columns": [], "version": "<version>"},
+ "'columns' in 'geo' metadata must be a dict",
+ ),
+ # missing "encoding" for column
+ (
+ {"primary_column": "foo", "columns": {"foo": {}}, "version": "<version>"},
+ (
+ "'geo' metadata in Parquet/Feather file is missing required key "
+ "'encoding' for column 'foo'"
+ ),
+ ),
+ # invalid column encoding
+ (
+ {
+ "primary_column": "foo",
+ "columns": {"foo": {"crs": None, "encoding": None}},
+ "version": "<version>",
+ },
+ "Only WKB geometry encoding",
+ ),
+ (
+ {
+ "primary_column": "foo",
+ "columns": {"foo": {"crs": None, "encoding": "BKW"}},
+ "version": "<version>",
+ },
+ "Only WKB geometry encoding",
+ ),
+ ],
+)
+def test_validate_geo_metadata_invalid(metadata, error):
+ with pytest.raises(ValueError, match=error):
+ _validate_geo_metadata(metadata)
+
+
+def test_validate_geo_metadata_edges():
+ metadata = {
+ "primary_column": "geometry",
+ "columns": {"geometry": {"crs": None, "encoding": "WKB", "edges": "spherical"}},
+ "version": "1.0.0-beta.1",
+ }
+ with pytest.warns(
+ UserWarning,
+ match="The geo metadata indicate that column 'geometry' has spherical edges",
+ ):
+ _validate_geo_metadata(metadata)
+
+
+def test_to_parquet_fails_on_invalid_engine(tmpdir):
+ df = GeoDataFrame(data=[[1, 2, 3]], columns=["a", "b", "a"], geometry=[Point(1, 1)])
+
+ with pytest.raises(
+ ValueError,
+ match=(
+ "GeoPandas only supports using pyarrow as the engine for "
+ "to_parquet: 'fastparquet' passed instead."
+ ),
+ ):
+ df.to_parquet(tmpdir / "test.parquet", engine="fastparquet")
+
+
+@mock.patch("geopandas.io.arrow._to_parquet")
+def test_to_parquet_does_not_pass_engine_along(mock_to_parquet):
+ df = GeoDataFrame(data=[[1, 2, 3]], columns=["a", "b", "a"], geometry=[Point(1, 1)])
+ df.to_parquet("", engine="pyarrow")
+ # assert that engine keyword is not passed through to _to_parquet (and thus
+ # parquet.write_table)
+ mock_to_parquet.assert_called_with(
+ df,
+ "",
+ compression="snappy",
+ geometry_encoding="WKB",
+ index=None,
+ schema_version=None,
+ write_covering_bbox=False,
+ )
+
+
+# TEMPORARY: used to determine if pyarrow fails for roundtripping pandas data
+# without geometries
+def test_pandas_parquet_roundtrip1(tmpdir):
+ df = DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename)
+
+ pq_df = pd_read_parquet(filename)
+
+ assert_frame_equal(df, pq_df)
+
+
+@pytest.mark.parametrize(
+ "test_dataset", ["naturalearth_lowres", "naturalearth_cities", "nybb_filename"]
+)
+def test_pandas_parquet_roundtrip2(test_dataset, tmpdir, request):
+ path = request.getfixturevalue(test_dataset)
+ df = DataFrame(read_file(path).drop(columns=["geometry"]))
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename)
+
+ pq_df = pd_read_parquet(filename)
+
+ assert_frame_equal(df, pq_df)
+
+
+@pytest.mark.parametrize(
+ "test_dataset", ["naturalearth_lowres", "naturalearth_cities", "nybb_filename"]
+)
def test_roundtrip(tmpdir, file_format, test_dataset, request):
"""Writing to parquet should not raise errors, and should not alter original
GeoDataFrame
"""
- pass
+ path = request.getfixturevalue(test_dataset)
+ reader, writer = file_format
+
+ df = read_file(path)
+ orig = df.copy()
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+
+ writer(df, filename)
+
+ assert os.path.exists(filename)
+
+ # make sure that the original data frame is unaltered
+ assert_geodataframe_equal(df, orig)
+
+ # make sure that we can roundtrip the data frame
+ pq_df = reader(filename)
+
+ assert isinstance(pq_df, GeoDataFrame)
+ assert_geodataframe_equal(df, pq_df)
def test_index(tmpdir, file_format, naturalearth_lowres):
"""Setting index=`True` should preserve index in output, and
setting index=`False` should drop index from output.
"""
- pass
+ reader, writer = file_format
+
+ df = read_file(naturalearth_lowres).set_index("iso_a3")
+
+ filename = os.path.join(str(tmpdir), "test_with_index.pq")
+ writer(df, filename, index=True)
+ pq_df = reader(filename)
+ assert_geodataframe_equal(df, pq_df)
+
+ filename = os.path.join(str(tmpdir), "drop_index.pq")
+ writer(df, filename, index=False)
+ pq_df = reader(filename)
+ assert_geodataframe_equal(df.reset_index(drop=True), pq_df)
def test_column_order(tmpdir, file_format, naturalearth_lowres):
"""The order of columns should be preserved in the output."""
- pass
+ reader, writer = file_format
+
+ df = read_file(naturalearth_lowres)
+ df = df.set_index("iso_a3")
+ df["geom2"] = df.geometry.representative_point()
+ table = _geopandas_to_arrow(df)
+ custom_column_order = [
+ "iso_a3",
+ "geom2",
+ "pop_est",
+ "continent",
+ "name",
+ "geometry",
+ "gdp_md_est",
+ ]
+ table = table.select(custom_column_order)
+ if reader is read_parquet:
+ filename = os.path.join(str(tmpdir), "test_column_order.pq")
+ pq.write_table(table, filename)
+ else:
+ filename = os.path.join(str(tmpdir), "test_column_order.feather")
+ feather.write_feather(table, filename)
+    result = reader(filename)
+    assert list(result.columns) == custom_column_order[1:]
+    assert_geodataframe_equal(result, df[custom_column_order[1:]])
+
+
-@pytest.mark.parametrize('compression', ['snappy', 'gzip', 'brotli', None])
+@pytest.mark.parametrize("compression", ["snappy", "gzip", "brotli", None])
def test_parquet_compression(compression, tmpdir, naturalearth_lowres):
"""Using compression options should not raise errors, and should
return identical GeoDataFrame.
"""
- pass
+ df = read_file(naturalearth_lowres)
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename, compression=compression)
+ pq_df = read_parquet(filename)
+
+ assert isinstance(pq_df, GeoDataFrame)
+ assert_geodataframe_equal(df, pq_df)
-@pytest.mark.skipif(Version(pyarrow.__version__) < Version('0.17.0'),
- reason='Feather only supported for pyarrow >= 0.17')
-@pytest.mark.parametrize('compression', ['uncompressed', 'lz4', 'zstd'])
+
+@pytest.mark.skipif(
+ Version(pyarrow.__version__) < Version("0.17.0"),
+ reason="Feather only supported for pyarrow >= 0.17",
+)
+@pytest.mark.parametrize("compression", ["uncompressed", "lz4", "zstd"])
def test_feather_compression(compression, tmpdir, naturalearth_lowres):
"""Using compression options should not raise errors, and should
return identical GeoDataFrame.
"""
- pass
+
+ df = read_file(naturalearth_lowres)
+
+ filename = os.path.join(str(tmpdir), "test.feather")
+ df.to_feather(filename, compression=compression)
+ pq_df = read_feather(filename)
+
+ assert isinstance(pq_df, GeoDataFrame)
+ assert_geodataframe_equal(df, pq_df)
def test_parquet_multiple_geom_cols(tmpdir, file_format, naturalearth_lowres):
"""If multiple geometry columns are present when written to parquet,
they should all be returned as such when read from parquet.
"""
- pass
+ reader, writer = file_format
+
+ df = read_file(naturalearth_lowres)
+ df["geom2"] = df.geometry.copy()
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+ writer(df, filename)
+
+ assert os.path.exists(filename)
+
+ pq_df = reader(filename)
+
+ assert isinstance(pq_df, GeoDataFrame)
+ assert_geodataframe_equal(df, pq_df)
+
+ assert_geoseries_equal(df.geom2, pq_df.geom2, check_geom_type=True)
def test_parquet_missing_metadata(tmpdir, naturalearth_lowres):
"""Missing geo metadata, such as from a parquet file created
from a pandas DataFrame, will raise a ValueError.
"""
- pass
+
+ df = read_file(naturalearth_lowres)
+
+ # convert to DataFrame
+ df = DataFrame(df)
+
+ # convert the geometry column so we can extract later
+ df["geometry"] = to_wkb(df["geometry"].values)
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+
+ # use pandas to_parquet (no geo metadata)
+ df.to_parquet(filename)
+
+ # missing metadata will raise ValueError
+ with pytest.raises(
+ ValueError, match="Missing geo metadata in Parquet/Feather file."
+ ):
+ read_parquet(filename)
def test_parquet_missing_metadata2(tmpdir):
@@ -83,53 +554,340 @@ def test_parquet_missing_metadata2(tmpdir):
from a pyarrow Table (which will also not contain pandas metadata),
will raise a ValueError.
"""
- pass
-@pytest.mark.parametrize('geo_meta,error', [({'geo': b''},
-    'Missing or malformed geo metadata in Parquet/Feather file'), ({'geo':
-    _encode_metadata({})},
-    'Missing or malformed geo metadata in Parquet/Feather file'), ({'geo':
-    _encode_metadata({'foo': 'bar'})},
-    "'geo' metadata in Parquet/Feather file is missing required key")])
-def test_parquet_invalid_metadata(tmpdir, geo_meta, error, naturalearth_lowres
+    import pyarrow.parquet as pq
+    table = pyarrow.table({"a": [1, 2, 3]})
+    filename = os.path.join(str(tmpdir), "test.pq")
+ # use pyarrow.parquet write_table (no geo metadata, but also no pandas metadata)
+ pq.write_table(table, filename)
+
+ # missing metadata will raise ValueError
+ with pytest.raises(
+ ValueError, match="Missing geo metadata in Parquet/Feather file."
):
+ read_parquet(filename)
+
+
+@pytest.mark.parametrize(
+ "geo_meta,error",
+ [
+ ({"geo": b""}, "Missing or malformed geo metadata in Parquet/Feather file"),
+ (
+ {"geo": _encode_metadata({})},
+ "Missing or malformed geo metadata in Parquet/Feather file",
+ ),
+ (
+ {"geo": _encode_metadata({"foo": "bar"})},
+ "'geo' metadata in Parquet/Feather file is missing required key",
+ ),
+ ],
+)
+def test_parquet_invalid_metadata(tmpdir, geo_meta, error, naturalearth_lowres):
"""Has geo metadata with missing required fields will raise a ValueError.
This requires writing the parquet file directly below, so that we can
control the metadata that is written for this test.
"""
- pass
+
+ from pyarrow import Table, parquet
+
+ df = read_file(naturalearth_lowres)
+
+ # convert to DataFrame and encode geometry to WKB
+ df = DataFrame(df)
+ df["geometry"] = to_wkb(df["geometry"].values)
+
+ table = Table.from_pandas(df)
+ metadata = table.schema.metadata
+ metadata.update(geo_meta)
+ table = table.replace_schema_metadata(metadata)
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+ parquet.write_table(table, filename)
+
+ with pytest.raises(ValueError, match=error):
+ read_parquet(filename)
def test_subset_columns(tmpdir, file_format, naturalearth_lowres):
"""Reading a subset of columns should correctly decode selected geometry
columns.
"""
- pass
+ reader, writer = file_format
+
+ df = read_file(naturalearth_lowres)
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+ writer(df, filename)
+ pq_df = reader(filename, columns=["name", "geometry"])
+
+ assert_geodataframe_equal(df[["name", "geometry"]], pq_df)
+
+ with pytest.raises(
+ ValueError, match="No geometry columns are included in the columns read"
+ ):
+ reader(filename, columns=["name"])
def test_promote_secondary_geometry(tmpdir, file_format, naturalearth_lowres):
"""Reading a subset of columns that does not include the primary geometry
column should promote the first geometry column present.
"""
- pass
+ reader, writer = file_format
+
+ df = read_file(naturalearth_lowres)
+ df["geom2"] = df.geometry.copy()
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+ writer(df, filename)
+ pq_df = reader(filename, columns=["name", "geom2"])
+
+ assert_geodataframe_equal(df.set_geometry("geom2")[["name", "geom2"]], pq_df)
+
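+    # requesting two non-primary geometry columns warns and promotes the first one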
+ df["geom3"] = df.geometry.copy()
+
+ writer(df, filename)
+ with pytest.warns(
+ UserWarning,
+ match="Multiple non-primary geometry columns read from Parquet/Feather file.",
+ ):
+ pq_df = reader(filename, columns=["name", "geom2", "geom3"])
+
+ assert_geodataframe_equal(
+ df.set_geometry("geom2")[["name", "geom2", "geom3"]], pq_df
+ )
def test_columns_no_geometry(tmpdir, file_format, naturalearth_lowres):
"""Reading a parquet file that is missing all of the geometry columns
should raise a ValueError"""
- pass
+ reader, writer = file_format
+
+ df = read_file(naturalearth_lowres)
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+ writer(df, filename)
+
+ with pytest.raises(ValueError):
+ reader(filename, columns=["name"])
def test_missing_crs(tmpdir, file_format, naturalearth_lowres):
"""If CRS is `None`, it should be properly handled
and remain `None` when read from parquet`.
"""
- pass
+ reader, writer = file_format
+
+ df = read_file(naturalearth_lowres)
+ df.geometry.array.crs = None
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+ writer(df, filename)
+ pq_df = reader(filename)
+
+ assert pq_df.crs is None
+
+ assert_geodataframe_equal(df, pq_df, check_crs=True)
+
+
+def test_default_geo_col_writes(tmp_path):
+ # edge case geo col name None writes successfully
+ df = GeoDataFrame({"a": [1, 2]})
+ df.to_parquet(tmp_path / "test.pq")
+ # cannot be round tripped as gdf due to invalid geom col
+ pq_df = pd_read_parquet(tmp_path / "test.pq")
+ assert_frame_equal(df, pq_df)
+
+
+@pytest.mark.skipif(
+ Version(pyarrow.__version__) >= Version("0.17.0"),
+ reason="Feather only supported for pyarrow >= 0.17",
+)
+def test_feather_arrow_version(tmpdir, naturalearth_lowres):
+ df = read_file(naturalearth_lowres)
+ filename = os.path.join(str(tmpdir), "test.feather")
+
+ with pytest.raises(
+ ImportError, match="pyarrow >= 0.17 required for Feather support"
+ ):
+ df.to_feather(filename)
-@pytest.mark.parametrize('version', ['0.1.0', '0.4.0', '1.0.0-beta.1'])
+def test_fsspec_url(naturalearth_lowres):
+ fsspec = pytest.importorskip("fsspec")
+ import fsspec.implementations.memory
+
+ class MyMemoryFileSystem(fsspec.implementations.memory.MemoryFileSystem):
+ # Simple fsspec filesystem that adds a required keyword.
+ # Attempting to use this filesystem without the keyword will raise an exception.
+ def __init__(self, is_set, *args, **kwargs):
+ self.is_set = is_set
+ super().__init__(*args, **kwargs)
+
+ fsspec.register_implementation("memory", MyMemoryFileSystem, clobber=True)
+ memfs = MyMemoryFileSystem(is_set=True)
+
+ df = read_file(naturalearth_lowres)
+
+ with memfs.open("data.parquet", "wb") as f:
+ df.to_parquet(f)
+
+ result = read_parquet("memory://data.parquet", storage_options={"is_set": True})
+ assert_geodataframe_equal(result, df)
+
+ result = read_parquet("memory://data.parquet", filesystem=memfs)
+ assert_geodataframe_equal(result, df)
+
+ # reset fsspec registry
+ fsspec.register_implementation(
+ "memory", fsspec.implementations.memory.MemoryFileSystem, clobber=True
+ )
+
+
+def test_non_fsspec_url_with_storage_options_raises(naturalearth_lowres):
+ with pytest.raises(ValueError, match="storage_options"):
+ read_parquet(naturalearth_lowres, storage_options={"foo": "bar"})
+
+
+@pytest.mark.skipif(
+ Version(pyarrow.__version__) < Version("5.0.0"),
+ reason="pyarrow.fs requires pyarrow>=5.0.0",
+)
+def test_prefers_pyarrow_fs():
+ filesystem, _ = _get_filesystem_path("file:///data.parquet")
+ assert isinstance(filesystem, pyarrow.fs.LocalFileSystem)
+
+
+def test_write_read_parquet_expand_user():
+ gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
+ test_file = "~/test_file.parquet"
+ gdf.to_parquet(test_file)
+ pq_df = geopandas.read_parquet(test_file)
+ assert_geodataframe_equal(gdf, pq_df, check_crs=True)
+ os.remove(os.path.expanduser(test_file))
+
+
+def test_write_read_feather_expand_user():
+ gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
+ test_file = "~/test_file.feather"
+ gdf.to_feather(test_file)
+ f_df = geopandas.read_feather(test_file)
+ assert_geodataframe_equal(gdf, f_df, check_crs=True)
+ os.remove(os.path.expanduser(test_file))
+
+
+@pytest.mark.parametrize("geometry", [[], [None]])
+def test_write_empty_bbox(tmpdir, geometry):
+ # empty dataframe or all missing geometries -> avoid bbox with NaNs
+ gdf = geopandas.GeoDataFrame({"col": [1] * len(geometry)}, geometry=geometry)
+ gdf.to_parquet(tmpdir / "test.parquet")
+
+ from pyarrow.parquet import read_table
+
+ table = read_table(tmpdir / "test.parquet")
+ metadata = json.loads(table.schema.metadata[b"geo"])
+ assert "encoding" in metadata["columns"]["geometry"]
+ assert "bbox" not in metadata["columns"]["geometry"]
+
+
+@pytest.mark.parametrize("format", ["feather", "parquet"])
+def test_write_read_default_crs(tmpdir, format):
+ pyproj = pytest.importorskip("pyproj")
+ if format == "feather":
+ from pyarrow.feather import write_feather as write
+ else:
+ from pyarrow.parquet import write_table as write
+
+ filename = os.path.join(str(tmpdir), f"test.{format}")
+ gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)])
+ table = _geopandas_to_arrow(gdf)
+
+ # update the geo metadata to strip 'crs' entry
+ metadata = table.schema.metadata
+ geo_metadata = _decode_metadata(metadata[b"geo"])
+ del geo_metadata["columns"]["geometry"]["crs"]
+ metadata.update({b"geo": _encode_metadata(geo_metadata)})
+ table = table.replace_schema_metadata(metadata)
+
+ write(table, filename)
+
+ read = getattr(geopandas, f"read_{format}")
+ df = read(filename)
+ assert df.crs.equals(pyproj.CRS("OGC:CRS84"))
+
+
+@pytest.mark.skipif(shapely.geos_version < (3, 10, 0), reason="requires GEOS>=3.10")
+def test_write_iso_wkb(tmpdir):
+ gdf = geopandas.GeoDataFrame(
+ geometry=geopandas.GeoSeries.from_wkt(["POINT Z (1 2 3)"])
+ )
+ gdf.to_parquet(tmpdir / "test.parquet")
+
+ from pyarrow.parquet import read_table
+
+ table = read_table(tmpdir / "test.parquet")
+ wkb = table["geometry"][0].as_py().hex()
+
+ # correct ISO flavor
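+    # (byte order 01 + geometry type 0x000003e9 = 1001, the ISO WKB code for Point Z;
+    # EWKB would instead flag the Z dimension with the 0x80000000 bit)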
+ assert wkb == "01e9030000000000000000f03f00000000000000400000000000000840"
+
+
+@pytest.mark.skipif(shapely.geos_version >= (3, 10, 0), reason="tests GEOS<3.10")
+def test_write_iso_wkb_old_geos(tmpdir):
+ gdf = geopandas.GeoDataFrame(
+ geometry=geopandas.GeoSeries.from_wkt(["POINT Z (1 2 3)"])
+ )
+ with pytest.raises(ValueError, match="Cannot write 3D"):
+ gdf.to_parquet(tmpdir / "test.parquet")
+
+
+@pytest.mark.parametrize(
+ "format,schema_version",
+ product(["feather", "parquet"], [None] + SUPPORTED_VERSIONS),
+)
+def test_write_spec_version(tmpdir, format, schema_version):
+ if format == "feather":
+ from pyarrow.feather import read_table
+ else:
+ from pyarrow.parquet import read_table
+
+ filename = os.path.join(str(tmpdir), f"test.{format}")
+ gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="EPSG:4326")
+ write = getattr(gdf, f"to_{format}")
+ write(filename, schema_version=schema_version)
+
+ # ensure that we can roundtrip data regardless of version
+ read = getattr(geopandas, f"read_{format}")
+ df = read(filename)
+ assert_geodataframe_equal(df, gdf)
+
+ # verify the correct version is written in the metadata
+ schema_version = schema_version or METADATA_VERSION
+ table = read_table(filename)
+ metadata = json.loads(table.schema.metadata[b"geo"])
+ assert metadata["version"] == schema_version
+
+ # verify that CRS is correctly handled between versions
+ if HAS_PYPROJ:
+ if schema_version == "0.1.0":
+ assert metadata["columns"]["geometry"]["crs"] == gdf.crs.to_wkt()
+
+ else:
+ crs_expected = gdf.crs.to_json_dict()
+ _remove_id_from_member_of_ensembles(crs_expected)
+ assert metadata["columns"]["geometry"]["crs"] == crs_expected
+
+ # verify that geometry_type(s) is correctly handled between versions
+ if Version(schema_version) <= Version("0.4.0"):
+ assert "geometry_type" in metadata["columns"]["geometry"]
+ assert metadata["columns"]["geometry"]["geometry_type"] == "Polygon"
+ else:
+ assert "geometry_types" in metadata["columns"]["geometry"]
+ assert metadata["columns"]["geometry"]["geometry_types"] == ["Polygon"]
+
+
+@pytest.mark.parametrize("version", ["0.1.0", "0.4.0", "1.0.0-beta.1"])
def test_read_versioned_file(version):
"""
Verify that files for different metadata spec versions can be read
@@ -145,7 +903,17 @@ def test_read_versioned_file(version):
df.to_feather(DATA_PATH / 'arrow' / f'test_data_v{METADATA_VERSION}.feather')
df.to_parquet(DATA_PATH / 'arrow' / f'test_data_v{METADATA_VERSION}.parquet')
"""
- pass
+ expected = geopandas.GeoDataFrame(
+ {"col_str": ["a", "b"], "col_int": [1, 2], "col_float": [0.1, 0.2]},
+ geometry=[MultiPolygon([box(0, 0, 1, 1), box(2, 2, 3, 3)]), box(4, 4, 5, 5)],
+ crs="EPSG:4326",
+ )
+
+ df = geopandas.read_feather(DATA_PATH / "arrow" / f"test_data_v{version}.feather")
+ assert_geodataframe_equal(df, expected, check_crs=True)
+
+ df = geopandas.read_parquet(DATA_PATH / "arrow" / f"test_data_v{version}.parquet")
+ assert_geodataframe_equal(df, expected, check_crs=True)
def test_read_gdal_files():
@@ -169,5 +937,396 @@ def test_read_gdal_files():
Repeated for GDAL 3.9 which adds a bbox covering column:
$ ogr2ogr -f Parquet -lco FID= test_data_gdal390.parquet test_data.gpkg
- """
- pass
+ """ # noqa: E501
+ pytest.importorskip("pyproj")
+ expected = geopandas.GeoDataFrame(
+ {"col_str": ["a", "b"], "col_int": [1, 2], "col_float": [0.1, 0.2]},
+ geometry=[MultiPolygon([box(0, 0, 1, 1), box(2, 2, 3, 3)]), box(4, 4, 5, 5)],
+ crs="EPSG:4326",
+ )
+
+ df = geopandas.read_parquet(DATA_PATH / "arrow" / "test_data_gdal350.parquet")
+ assert_geodataframe_equal(df, expected, check_crs=True)
+
+ df = geopandas.read_feather(DATA_PATH / "arrow" / "test_data_gdal350.arrow")
+ assert_geodataframe_equal(df, expected, check_crs=True)
+
+ df = geopandas.read_parquet(DATA_PATH / "arrow" / "test_data_gdal390.parquet")
+ # recent GDAL no longer writes CRS in metadata in case of EPSG:4326, so comes back
+ # as default OGC:CRS84
+ expected = expected.to_crs("OGC:CRS84")
+ assert_geodataframe_equal(df, expected, check_crs=True)
+
+ df = geopandas.read_parquet(
+ DATA_PATH / "arrow" / "test_data_gdal390.parquet", bbox=(0, 0, 2, 2)
+ )
+ assert len(df) == 1
+
+
+def test_parquet_read_partitioned_dataset(tmpdir, naturalearth_lowres):
+ # we don't yet explicitly support this (in writing), but for Parquet it
+ # works for reading (by relying on pyarrow.read_table)
+ df = read_file(naturalearth_lowres)
+
+ # manually create partitioned dataset
+ basedir = tmpdir / "partitioned_dataset"
+ basedir.mkdir()
+ df[:100].to_parquet(basedir / "data1.parquet")
+ df[100:].to_parquet(basedir / "data2.parquet")
+
+ result = read_parquet(basedir)
+ assert_geodataframe_equal(result, df)
+
+
+def test_parquet_read_partitioned_dataset_fsspec(tmpdir, naturalearth_lowres):
+ fsspec = pytest.importorskip("fsspec")
+
+ df = read_file(naturalearth_lowres)
+
+ # manually create partitioned dataset
+ memfs = fsspec.filesystem("memory")
+ memfs.mkdir("partitioned_dataset")
+ with memfs.open("partitioned_dataset/data1.parquet", "wb") as f:
+ df[:100].to_parquet(f)
+ with memfs.open("partitioned_dataset/data2.parquet", "wb") as f:
+ df[100:].to_parquet(f)
+
+ result = read_parquet("memory://partitioned_dataset")
+ assert_geodataframe_equal(result, df)
+
+
+@pytest.mark.parametrize(
+ "geometry_type",
+ ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
+)
+def test_read_parquet_geoarrow(geometry_type):
+ result = geopandas.read_parquet(
+ DATA_PATH
+ / "arrow"
+ / "geoparquet"
+ / f"data-{geometry_type}-encoding_native.parquet"
+ )
+ expected = geopandas.read_parquet(
+ DATA_PATH
+ / "arrow"
+ / "geoparquet"
+ / f"data-{geometry_type}-encoding_wkb.parquet"
+ )
+ assert_geodataframe_equal(result, expected, check_crs=True)
+
+
+@pytest.mark.parametrize(
+ "geometry_type",
+ ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
+)
+def test_geoarrow_roundtrip(tmp_path, geometry_type):
+
+ df = geopandas.read_parquet(
+ DATA_PATH
+ / "arrow"
+ / "geoparquet"
+ / f"data-{geometry_type}-encoding_wkb.parquet"
+ )
+
+ df.to_parquet(tmp_path / "test.parquet", geometry_encoding="geoarrow")
+ result = geopandas.read_parquet(tmp_path / "test.parquet")
+ assert_geodataframe_equal(result, df, check_crs=True)
+
+
+def test_to_parquet_bbox_structure_and_metadata(tmpdir, naturalearth_lowres):
+ # check metadata being written for covering.
+ from pyarrow import parquet
+
+ df = read_file(naturalearth_lowres)
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename, write_covering_bbox=True)
+
+ table = parquet.read_table(filename)
+ metadata = json.loads(table.schema.metadata[b"geo"].decode("utf-8"))
+ assert metadata["columns"]["geometry"]["covering"] == {
+ "bbox": {
+ "xmin": ["bbox", "xmin"],
+ "ymin": ["bbox", "ymin"],
+ "xmax": ["bbox", "xmax"],
+ "ymax": ["bbox", "ymax"],
+ }
+ }
+ assert "bbox" in table.schema.names
+ assert [field.name for field in table.schema.field("bbox").type] == [
+ "xmin",
+ "ymin",
+ "xmax",
+ "ymax",
+ ]
+
+
+@pytest.mark.parametrize(
+ "geometry, expected_bbox",
+ [
+ (Point(1, 3), {"xmin": 1.0, "ymin": 3.0, "xmax": 1.0, "ymax": 3.0}),
+ (
+ LineString([(1, 1), (3, 3)]),
+ {"xmin": 1.0, "ymin": 1.0, "xmax": 3.0, "ymax": 3.0},
+ ),
+ (
+ Polygon([(2, 1), (1, 2), (2, 3), (3, 2)]),
+ {"xmin": 1.0, "ymin": 1.0, "xmax": 3.0, "ymax": 3.0},
+ ),
+ (
+ MultiPolygon([box(0, 0, 1, 1), box(2, 2, 3, 3), box(4, 4, 5, 5)]),
+ {"xmin": 0.0, "ymin": 0.0, "xmax": 5.0, "ymax": 5.0},
+ ),
+ ],
+ ids=["Point", "LineString", "Polygon", "Multipolygon"],
+)
+def test_to_parquet_bbox_values(tmpdir, geometry, expected_bbox):
+ # check bbox bounds being written for different geometry types.
+ import pyarrow.parquet as pq
+
+ df = GeoDataFrame(data=[[1, 2]], columns=["a", "b"], geometry=[geometry])
+ filename = os.path.join(str(tmpdir), "test.pq")
+
+ df.to_parquet(filename, write_covering_bbox=True)
+
+ result = pq.read_table(filename).to_pandas()
+ assert result["bbox"][0] == expected_bbox
+
+
+def test_read_parquet_bbox_single_point(tmpdir):
+    # confirm that a bbox query matching a single point picks it up.
+ df = GeoDataFrame(data=[[1, 2]], columns=["a", "b"], geometry=[Point(1, 1)])
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename, write_covering_bbox=True)
+ pq_df = read_parquet(filename, bbox=(1, 1, 1, 1))
+ assert len(pq_df) == 1
+ assert pq_df.geometry[0] == Point(1, 1)
+
+
+@pytest.mark.parametrize("geometry_name", ["geometry", "custum_geom_col"])
+def test_read_parquet_bbox(tmpdir, naturalearth_lowres, geometry_name):
+ # check bbox is being used to filter results.
+ df = read_file(naturalearth_lowres)
+ if geometry_name != "geometry":
+ df = df.rename_geometry(geometry_name)
+
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename, write_covering_bbox=True)
+
+ pq_df = read_parquet(filename, bbox=(0, 0, 10, 10))
+
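+    # note: filtering compares the query box against each row's covering bbox, so rows
+    # such as France (whose multi-part geometry has a large bbox) are returned as well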
+ assert pq_df["name"].values.tolist() == [
+ "France",
+ "Benin",
+ "Nigeria",
+ "Cameroon",
+ "Togo",
+ "Ghana",
+ "Burkina Faso",
+ "Gabon",
+ "Eq. Guinea",
+ ]
+
+
+@pytest.mark.parametrize("geometry_name", ["geometry", "custum_geom_col"])
+def test_read_parquet_bbox_partitioned(tmpdir, naturalearth_lowres, geometry_name):
+    # check bbox is being used to filter results on partitioned data.
+ df = read_file(naturalearth_lowres)
+ if geometry_name != "geometry":
+ df = df.rename_geometry(geometry_name)
+
+ # manually create partitioned dataset
+ basedir = tmpdir / "partitioned_dataset"
+ basedir.mkdir()
+ df[:100].to_parquet(basedir / "data1.parquet", write_covering_bbox=True)
+ df[100:].to_parquet(basedir / "data2.parquet", write_covering_bbox=True)
+
+ pq_df = read_parquet(basedir, bbox=(0, 0, 10, 10))
+
+ assert pq_df["name"].values.tolist() == [
+ "France",
+ "Benin",
+ "Nigeria",
+ "Cameroon",
+ "Togo",
+ "Ghana",
+ "Burkina Faso",
+ "Gabon",
+ "Eq. Guinea",
+ ]
+
+
+@pytest.mark.parametrize(
+ "geometry, bbox",
+ [
+ (LineString([(1, 1), (3, 3)]), (1.5, 1.5, 3.5, 3.5)),
+ (LineString([(1, 1), (3, 3)]), (3, 3, 3, 3)),
+ (LineString([(1, 1), (3, 3)]), (1.5, 1.5, 2.5, 2.5)),
+ (Polygon([(0, 0), (4, 0), (4, 4), (0, 4)]), (1, 1, 3, 3)),
+ (Polygon([(0, 0), (4, 0), (4, 4), (0, 4)]), (1, 1, 5, 5)),
+ (Polygon([(0, 0), (4, 0), (4, 4), (0, 4)]), (2, 2, 4, 4)),
+ (Polygon([(0, 0), (4, 0), (4, 4), (0, 4)]), (4, 4, 4, 4)),
+ (Polygon([(0, 0), (4, 0), (4, 4), (0, 4)]), (1, 1, 5, 3)),
+ ],
+)
+def test_read_parquet_bbox_partial_overlap_of_geometry(tmpdir, geometry, bbox):
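+    # a bbox that only partially overlaps the geometry should still return the row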
+ df = GeoDataFrame(data=[[1, 2]], columns=["a", "b"], geometry=[geometry])
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename, write_covering_bbox=True)
+
+ pq_df = read_parquet(filename, bbox=bbox)
+ assert len(pq_df) == 1
+
+
+def test_read_parquet_no_bbox(tmpdir, naturalearth_lowres):
+    # check the error message when the parquet file lacks a bbox column but
+    # the bbox kwarg is passed to read_parquet.
+ df = read_file(naturalearth_lowres)
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename)
+ with pytest.raises(ValueError, match="Specifying 'bbox' not supported"):
+ read_parquet(filename, bbox=(0, 0, 20, 20))
+
+
+def test_read_parquet_no_bbox_partitioned(tmpdir, naturalearth_lowres):
+    # check the error message when the partitioned parquet data does not have
+    # a bbox column but the bbox kwarg is passed to read_parquet.
+ df = read_file(naturalearth_lowres)
+
+ # manually create partitioned dataset
+ basedir = tmpdir / "partitioned_dataset"
+ basedir.mkdir()
+ df[:100].to_parquet(basedir / "data1.parquet")
+ df[100:].to_parquet(basedir / "data2.parquet")
+
+ with pytest.raises(ValueError, match="Specifying 'bbox' not supported"):
+ read_parquet(basedir, bbox=(0, 0, 20, 20))
+
+
+def test_convert_bbox_to_parquet_filter():
+ # check conversion of bbox to parquet filter expression
+ import pyarrow.compute as pc
+
+ bbox = (0, 0, 25, 35)
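+    # the filter keeps rows whose bbox struct is not disjoint from the query bbox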
+ expected = ~(
+ (pc.field(("bbox", "xmin")) > 25)
+ | (pc.field(("bbox", "ymin")) > 35)
+ | (pc.field(("bbox", "xmax")) < 0)
+ | (pc.field(("bbox", "ymax")) < 0)
+ )
+ assert expected.equals(_convert_bbox_to_parquet_filter(bbox, "bbox"))
+
+
+def test_read_parquet_bbox_column_default_behaviour(tmpdir, naturalearth_lowres):
+ # check that bbox column is not read in by default
+
+ df = read_file(naturalearth_lowres)
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename, write_covering_bbox=True)
+ result1 = read_parquet(filename)
+ assert "bbox" not in result1
+
+ result2 = read_parquet(filename, columns=["name", "geometry"])
+ assert "bbox" not in result2
+ assert list(result2.columns) == ["name", "geometry"]
+
+
+@pytest.mark.parametrize(
+ "filters",
+ [
+ [("gdp_md_est", ">", 20000)],
+ pc.field("gdp_md_est") > 20000,
+ ],
+)
+def test_read_parquet_filters_and_bbox(tmpdir, naturalearth_lowres, filters):
+ df = read_file(naturalearth_lowres)
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename, write_covering_bbox=True)
+
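+    # both the attribute filter and the bbox filter should be applied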
+ result = read_parquet(filename, filters=filters, bbox=(0, 0, 20, 20))
+ assert result["name"].values.tolist() == [
+ "Dem. Rep. Congo",
+ "France",
+ "Nigeria",
+ "Cameroon",
+ "Ghana",
+ "Algeria",
+ "Libya",
+ ]
+
+
+@pytest.mark.parametrize(
+ "filters",
+ [
+ ([("gdp_md_est", ">", 15000), ("gdp_md_est", "<", 16000)]),
+ ((pc.field("gdp_md_est") > 15000) & (pc.field("gdp_md_est") < 16000)),
+ ],
+)
+def test_read_parquet_filters_without_bbox(tmpdir, naturalearth_lowres, filters):
+ df = read_file(naturalearth_lowres)
+ filename = os.path.join(str(tmpdir), "test.pq")
+ df.to_parquet(filename, write_covering_bbox=True)
+
+ result = read_parquet(filename, filters=filters)
+ assert result["name"].values.tolist() == ["Burkina Faso", "Mozambique", "Albania"]
+
+
+def test_read_parquet_file_with_custom_bbox_encoding_fieldname(tmpdir):
+ import pyarrow.parquet as pq
+
+ data = {
+ "name": ["point1", "point2", "point3"],
+ "geometry": [Point(1, 1), Point(2, 2), Point(3, 3)],
+ }
+ df = GeoDataFrame(data)
+ filename = os.path.join(str(tmpdir), "test.pq")
+
+ table = _geopandas_to_arrow(
+ df,
+ schema_version="1.1.0",
+ write_covering_bbox=True,
+ )
+ metadata = table.schema.metadata # rename_columns results in wiping of metadata
+
+ table = table.rename_columns(["name", "geometry", "custom_bbox_name"])
+
+ geo_metadata = json.loads(metadata[b"geo"])
+ geo_metadata["columns"]["geometry"]["covering"]["bbox"] = {
+ "xmin": ["custom_bbox_name", "xmin"],
+ "ymin": ["custom_bbox_name", "ymin"],
+ "xmax": ["custom_bbox_name", "xmax"],
+ "ymax": ["custom_bbox_name", "ymax"],
+ }
+ metadata.update({b"geo": _encode_metadata(geo_metadata)})
+
+ table = table.replace_schema_metadata(metadata)
+ pq.write_table(table, filename)
+
+ pq_table = pq.read_table(filename)
+ assert "custom_bbox_name" in pq_table.schema.names
+
+ pq_df = read_parquet(filename, bbox=(1.5, 1.5, 2.5, 2.5))
+ assert pq_df["name"].values.tolist() == ["point2"]
+
+
+def test_to_parquet_with_existing_bbox_column(tmpdir, naturalearth_lowres):
+ df = read_file(naturalearth_lowres)
+ df = df.assign(bbox=[0] * len(df))
+ filename = os.path.join(str(tmpdir), "test.pq")
+
+ with pytest.raises(
+ ValueError, match="An existing column 'bbox' already exists in the dataframe"
+ ):
+ df.to_parquet(filename, write_covering_bbox=True)
+
+
+def test_read_parquet_bbox_points(tmp_path):
+ # check bbox filtering on point geometries
+ df = geopandas.GeoDataFrame(
+ {"col": range(10)}, geometry=[Point(i, i) for i in range(10)]
+ )
+ df.to_parquet(tmp_path / "test.parquet", geometry_encoding="geoarrow")
+
+ result = geopandas.read_parquet(tmp_path / "test.parquet", bbox=(0, 0, 10, 10))
+ assert len(result) == 10
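+    # only points (3, 3), (4, 4) and (5, 5) fall within (3, 3, 5, 5)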
+ result = geopandas.read_parquet(tmp_path / "test.parquet", bbox=(3, 3, 5, 5))
+ assert len(result) == 3
diff --git a/geopandas/io/tests/test_file.py b/geopandas/io/tests/test_file.py
index c64ca09a..014b1f3c 100644
--- a/geopandas/io/tests/test_file.py
+++ b/geopandas/io/tests/test_file.py
@@ -7,93 +7,412 @@ import shutil
import tempfile
from collections import OrderedDict
from packaging.version import Version
+
import numpy as np
import pandas as pd
import pytz
from pandas.api.types import is_datetime64_any_dtype
+
from shapely.geometry import Point, Polygon, box, mapping
+
import geopandas
from geopandas import GeoDataFrame, read_file
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20, PANDAS_GE_30
from geopandas.io.file import _EXTENSION_TO_DRIVER, _detect_driver
+
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from geopandas.tests.util import PACKAGE_DIR, validate_boro_df
from pandas.testing import assert_frame_equal, assert_series_equal
+
try:
import pyogrio
- PYOGRIO_GE_090 = Version(Version(pyogrio.__version__).base_version
- ) >= Version('0.9.0')
+
+ # those version checks have to be defined here instead of imported from
+ # geopandas.io.file (those are only initialized lazily on first usage)
+ PYOGRIO_GE_090 = Version(Version(pyogrio.__version__).base_version) >= Version(
+ "0.9.0"
+ )
except ImportError:
pyogrio = False
PYOGRIO_GE_090 = False
+
+
try:
import fiona
- FIONA_GE_19 = Version(Version(fiona.__version__).base_version) >= Version(
- '1.9.0')
+
+ FIONA_GE_19 = Version(Version(fiona.__version__).base_version) >= Version("1.9.0")
except ImportError:
fiona = False
FIONA_GE_19 = False
-PYOGRIO_MARK = pytest.mark.skipif(not pyogrio, reason='pyogrio not installed')
-FIONA_MARK = pytest.mark.skipif(not fiona, reason='fiona not installed')
-_CRS = 'epsg:4326'
-pytestmark = pytest.mark.filterwarnings('ignore:Value:RuntimeWarning:pyogrio')
-driver_ext_pairs = [('ESRI Shapefile', '.shp'), ('GeoJSON', '.geojson'), (
- 'GPKG', '.gpkg'), (None, '.shp'), (None, ''), (None, '.geojson'), (None,
- '.gpkg')]
-@pytest.mark.parametrize('driver,ext', driver_ext_pairs)
+PYOGRIO_MARK = pytest.mark.skipif(not pyogrio, reason="pyogrio not installed")
+FIONA_MARK = pytest.mark.skipif(not fiona, reason="fiona not installed")
+
+
+_CRS = "epsg:4326"
+
+
+pytestmark = pytest.mark.filterwarnings("ignore:Value:RuntimeWarning:pyogrio")
+
+
+@pytest.fixture(
+ params=[
+ pytest.param("fiona", marks=FIONA_MARK),
+ pytest.param("pyogrio", marks=PYOGRIO_MARK),
+ ]
+)
+def engine(request):
+ return request.param
+
+
+def skip_pyogrio_not_supported(engine):
+ if engine == "pyogrio":
+ pytest.skip("not supported for the pyogrio engine")
+
+
+@pytest.fixture
+def df_nybb(engine, nybb_filename):
+ df = read_file(nybb_filename, engine=engine)
+ return df
+
+
+@pytest.fixture
+def df_null():
+ return read_file(
+ os.path.join(PACKAGE_DIR, "geopandas", "tests", "data", "null_geom.geojson")
+ )
+
+
+@pytest.fixture
+def file_path():
+ return os.path.join(PACKAGE_DIR, "geopandas", "tests", "data", "null_geom.geojson")
+
+
+@pytest.fixture
+def df_points():
+ N = 10
+ crs = _CRS
+ df = GeoDataFrame(
+ [
+ {"geometry": Point(x, y), "value1": x + y, "value2": x * y}
+ for x, y in zip(range(N), range(N))
+ ],
+ crs=crs,
+ )
+ return df
+
+
+# -----------------------------------------------------------------------------
+# to_file tests
+# -----------------------------------------------------------------------------
+
+driver_ext_pairs = [
+ ("ESRI Shapefile", ".shp"),
+ ("GeoJSON", ".geojson"),
+ ("GPKG", ".gpkg"),
+ (None, ".shp"),
+ (None, ""),
+ (None, ".geojson"),
+ (None, ".gpkg"),
+]
+
+
+def assert_correct_driver(file_path, ext, engine):
+ # check the expected driver
+ expected_driver = "ESRI Shapefile" if ext == "" else _EXTENSION_TO_DRIVER[ext]
+
+ if engine == "fiona":
+ with fiona.open(str(file_path)) as fds:
+ assert fds.driver == expected_driver
+ else:
+ # TODO pyogrio doesn't yet provide a way to check the driver of a file
+ return
+
+
+@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
def test_to_file(tmpdir, df_nybb, df_null, driver, ext, engine):
"""Test to_file and from_file"""
- pass
+ tempfilename = os.path.join(str(tmpdir), "boros." + ext)
+ df_nybb.to_file(tempfilename, driver=driver, engine=engine)
+ # Read layer back in
+ df = GeoDataFrame.from_file(tempfilename, engine=engine)
+ assert "geometry" in df
+ assert len(df) == 5
+ assert np.all(df["BoroName"].values == df_nybb["BoroName"])
+ # Write layer with null geometry out to file
+ tempfilename = os.path.join(str(tmpdir), "null_geom" + ext)
+ df_null.to_file(tempfilename, driver=driver, engine=engine)
+ # Read layer back in
+ df = GeoDataFrame.from_file(tempfilename, engine=engine)
+ assert "geometry" in df
+ assert len(df) == 2
+ assert np.all(df["Name"].values == df_null["Name"])
+ # check the expected driver
+ assert_correct_driver(tempfilename, ext, engine)
-@pytest.mark.parametrize('driver,ext', driver_ext_pairs)
+
+@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
def test_to_file_pathlib(tmpdir, df_nybb, driver, ext, engine):
"""Test to_file and from_file"""
- pass
+ temppath = pathlib.Path(os.path.join(str(tmpdir), "boros." + ext))
+ df_nybb.to_file(temppath, driver=driver, engine=engine)
+ # Read layer back in
+ df = GeoDataFrame.from_file(temppath, engine=engine)
+ assert "geometry" in df
+ assert len(df) == 5
+ assert np.all(df["BoroName"].values == df_nybb["BoroName"])
+ # check the expected driver
+ assert_correct_driver(temppath, ext, engine)
-@pytest.mark.parametrize('driver,ext', driver_ext_pairs)
+@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
def test_to_file_bool(tmpdir, driver, ext, engine):
"""Test error raise when writing with a boolean column (GH #437)."""
- pass
+ tempfilename = os.path.join(str(tmpdir), "temp.{0}".format(ext))
+ df = GeoDataFrame(
+ {
+ "col": [True, False, True],
+ "geometry": [Point(0, 0), Point(1, 1), Point(2, 2)],
+ },
+ crs=4326,
+ )
+
+ df.to_file(tempfilename, driver=driver, engine=engine)
+ result = read_file(tempfilename, engine=engine)
+ if ext in (".shp", ""):
+ # Shapefile does not support boolean, so is read back as int
+        # but GDAL >= 3.9 does support boolean fields in SHP
+ if engine == "fiona" and fiona.gdal_version.minor < 9:
+ df["col"] = df["col"].astype("int64")
+ elif engine == "pyogrio" and pyogrio.__gdal_version__ < (3, 9):
+ df["col"] = df["col"].astype("int32")
+ assert_geodataframe_equal(result, df)
+ # check the expected driver
+ assert_correct_driver(tempfilename, ext, engine)
TEST_DATE = datetime.datetime(2021, 11, 21, 1, 7, 43, 17500)
-eastern = pytz.timezone('America/New_York')
-datetime_type_tests = TEST_DATE, eastern.localize(TEST_DATE)
+eastern = pytz.timezone("America/New_York")
+
+datetime_type_tests = (TEST_DATE, eastern.localize(TEST_DATE))
@pytest.mark.filterwarnings(
- 'ignore:Non-conformant content for record 1 in column b:RuntimeWarning')
-@pytest.mark.parametrize('time', datetime_type_tests, ids=('naive_datetime',
- 'datetime_with_timezone'))
-@pytest.mark.parametrize('driver,ext', driver_ext_pairs)
+ "ignore:Non-conformant content for record 1 in column b:RuntimeWarning"
+) # for GPKG, GDAL writes the tz data but warns on reading (see DATETIME_FORMAT option)
+@pytest.mark.parametrize(
+ "time", datetime_type_tests, ids=("naive_datetime", "datetime_with_timezone")
+)
+@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
def test_to_file_datetime(tmpdir, driver, ext, time, engine):
"""Test writing a data file with the datetime column type"""
- pass
+ if ext in (".shp", ""):
+ pytest.skip(f"Driver corresponding to ext {ext} doesn't support dt fields")
+
+ tempfilename = os.path.join(str(tmpdir), f"test_datetime{ext}")
+ point = Point(0, 0)
+
+ df = GeoDataFrame(
+ {"a": [1.0, 2.0], "b": [time, time]}, geometry=[point, point], crs=4326
+ )
+ df["b"] = df["b"].dt.round(freq="ms")
+
+ df.to_file(tempfilename, driver=driver, engine=engine)
+ df_read = read_file(tempfilename, engine=engine)
+
+ assert_geodataframe_equal(df.drop(columns=["b"]), df_read.drop(columns=["b"]))
+ # Check datetime column
+ expected = df["b"]
+ if PANDAS_GE_20:
+ expected = df["b"].dt.as_unit("ms")
+ actual = df_read["b"]
+ if df["b"].dt.tz is not None:
+ # US/Eastern becomes pytz.FixedOffset(-300) when read from file
+ # as GDAL only models offsets, not timezones.
+ # Compare fair result in terms of UTC instead
+ expected = expected.dt.tz_convert(pytz.utc)
+ actual = actual.dt.tz_convert(pytz.utc)
+
+ assert_series_equal(expected, actual)
+
+
+dt_exts = ["gpkg", "geojson"]
+
+
+def write_invalid_date_file(date_str, tmpdir, ext, engine):
+ tempfilename = os.path.join(str(tmpdir), f"test_invalid_datetime.{ext}")
+ df = GeoDataFrame(
+ {
+ "date": ["2014-08-26T10:01:23", "2014-08-26T10:01:23", date_str],
+ "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)],
+ }
+ )
+ # Schema not required for GeoJSON since not typed, but needed for GPKG
+ if ext == "geojson":
+ df.to_file(tempfilename, engine=engine)
+ else:
+ schema = {"geometry": "Point", "properties": {"date": "datetime"}}
+ if engine == "pyogrio" and not fiona:
+            # (use schema to write the invalid date without pandas datetimes)
+ pytest.skip("test requires fiona kwarg schema")
+ df.to_file(tempfilename, schema=schema, engine="fiona")
+ return tempfilename
+
+
+@pytest.mark.parametrize("ext", dt_exts)
+def test_read_file_datetime_invalid(tmpdir, ext, engine):
+ # https://github.com/geopandas/geopandas/issues/2502
+ date_str = "9999-99-99T00:00:00" # invalid date handled by GDAL
+ tempfilename = write_invalid_date_file(date_str, tmpdir, ext, engine)
+ res = read_file(tempfilename, engine=engine)
+ if ext == "gpkg":
+ assert is_datetime64_any_dtype(res["date"])
+ assert pd.isna(res["date"].iloc[-1])
+ else:
+ assert res["date"].dtype == "object"
+ assert isinstance(res["date"].iloc[-1], str)
+
+@pytest.mark.parametrize("ext", dt_exts)
+def test_read_file_datetime_out_of_bounds_ns(tmpdir, ext, engine):
+ if engine == "pyogrio" and not PANDAS_GE_20:
+ pytest.skip("with pyogrio requires pandas >= 2.0 to pass")
+ # https://github.com/geopandas/geopandas/issues/2502
+ date_str = "9999-12-31T00:00:00" # valid to GDAL, not to [ns] format
+ tempfilename = write_invalid_date_file(date_str, tmpdir, ext, engine)
+ res = read_file(tempfilename, engine=engine)
+ if PANDAS_GE_30:
+ assert res["date"].dtype == "datetime64[ms]"
+ assert res["date"].iloc[-1] == pd.Timestamp("9999-12-31 00:00:00")
+ else:
+ # Pandas invalid datetimes are read in as object dtype (strings)
+ assert res["date"].dtype == "object"
+ assert isinstance(res["date"].iloc[0], str)
-dt_exts = ['gpkg', 'geojson']
+
+def test_read_file_datetime_mixed_offsets(tmpdir):
+ # https://github.com/geopandas/geopandas/issues/2478
+ tempfilename = os.path.join(str(tmpdir), "test_mixed_datetime.geojson")
+ df = GeoDataFrame(
+ {
+ "date": [
+ "2014-08-26 10:01:23.040001+02:00",
+ "2019-03-07 17:31:43.118999+01:00",
+ ],
+ "geometry": [Point(1, 1), Point(1, 1)],
+ }
+ )
+ df.to_file(tempfilename)
+ # check mixed tz don't crash GH2478
+ res = read_file(tempfilename)
+ # Convert mixed timezones to UTC equivalent
+ assert is_datetime64_any_dtype(res["date"])
+ if not PANDAS_GE_20:
+ utc = pytz.utc
+ else:
+ utc = datetime.timezone.utc
+ assert res["date"].dt.tz == utc
-@pytest.mark.parametrize('driver,ext', driver_ext_pairs)
+@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
def test_to_file_with_point_z(tmpdir, ext, driver, engine):
"""Test that 3D geometries are retained in writes (GH #612)."""
- pass
+
+ tempfilename = os.path.join(str(tmpdir), "test_3Dpoint" + ext)
+ point3d = Point(0, 0, 500)
+ point2d = Point(1, 1)
+ df = GeoDataFrame({"a": [1, 2]}, geometry=[point3d, point2d], crs=_CRS)
+ df.to_file(tempfilename, driver=driver, engine=engine)
+ df_read = GeoDataFrame.from_file(tempfilename, engine=engine)
+ assert_geoseries_equal(df.geometry, df_read.geometry)
+ # check the expected driver
+ assert_correct_driver(tempfilename, ext, engine)
-@pytest.mark.parametrize('driver,ext', driver_ext_pairs)
+@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
def test_to_file_with_poly_z(tmpdir, ext, driver, engine):
"""Test that 3D geometries are retained in writes (GH #612)."""
- pass
+
+ tempfilename = os.path.join(str(tmpdir), "test_3Dpoly" + ext)
+ poly3d = Polygon([[0, 0, 5], [0, 1, 5], [1, 1, 5], [1, 0, 5]])
+ poly2d = Polygon([[0, 0], [0, 1], [1, 1], [1, 0]])
+ df = GeoDataFrame({"a": [1, 2]}, geometry=[poly3d, poly2d], crs=_CRS)
+ df.to_file(tempfilename, driver=driver, engine=engine)
+ df_read = GeoDataFrame.from_file(tempfilename, engine=engine)
+ assert_geoseries_equal(df.geometry, df_read.geometry)
+ # check the expected driver
+ assert_correct_driver(tempfilename, ext, engine)
def test_to_file_types(tmpdir, df_points, engine):
"""Test various integer type columns (GH#93)"""
- pass
+ tempfilename = os.path.join(str(tmpdir), "int.shp")
+ int_types = [
+ np.int8,
+ np.int16,
+ np.int32,
+ np.int64,
+ np.intp,
+ np.uint8,
+ np.uint16,
+ np.uint32,
+ np.uint64,
+ ]
+ geometry = df_points.geometry
+ data = {
+ str(i): np.arange(len(geometry), dtype=dtype)
+ for i, dtype in enumerate(int_types)
+ }
+ df = GeoDataFrame(data, geometry=geometry)
+ df.to_file(tempfilename, engine=engine)
+
+
+@pytest.mark.parametrize("driver,ext", driver_ext_pairs + [("OGR_GMT", ".gmt")])
+def test_to_file_int32(tmpdir, df_points, engine, driver, ext):
+ tempfilename = os.path.join(str(tmpdir), f"int32.{ext}")
+ geometry = df_points.geometry
+ df = GeoDataFrame(geometry=geometry)
+ df["data"] = pd.array([1, np.nan] * 5, dtype=pd.Int32Dtype())
+ df.to_file(tempfilename, driver=driver, engine=engine)
+ df_read = GeoDataFrame.from_file(tempfilename, engine=engine)
+ # the int column with missing values comes back as float
+ expected = df.copy()
+ expected["data"] = expected["data"].astype("float64")
+ assert_geodataframe_equal(df_read, expected, check_like=True)
+
+ tempfilename2 = os.path.join(str(tmpdir), f"int32_2.{ext}")
+ df2 = df.dropna()
+ df2.to_file(tempfilename2, driver=driver, engine=engine)
+ df2_read = GeoDataFrame.from_file(tempfilename2, engine=engine)
+ if engine == "pyogrio":
+ assert df2_read["data"].dtype == "int32"
+ else:
+ # with the fiona engine the 32 bitwidth is not preserved
+ assert df2_read["data"].dtype == "int64"
+
+
+@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
+def test_to_file_int64(tmpdir, df_points, engine, driver, ext):
+ tempfilename = os.path.join(str(tmpdir), f"int64.{ext}")
+ geometry = df_points.geometry
+ df = GeoDataFrame(geometry=geometry)
+ df["data"] = pd.array([1, np.nan] * 5, dtype=pd.Int64Dtype())
+ df.to_file(tempfilename, driver=driver, engine=engine)
+ df_read = GeoDataFrame.from_file(tempfilename, engine=engine)
+ # the int column with missing values comes back as float
+ expected = df.copy()
+ expected["data"] = expected["data"].astype("float64")
+ assert_geodataframe_equal(df_read, expected, check_like=True)
+
+
+def test_to_file_empty(tmpdir, engine):
+ input_empty_df = GeoDataFrame(columns=["geometry"])
+ tempfilename = os.path.join(str(tmpdir), "test.shp")
+ with pytest.warns(UserWarning):
+ input_empty_df.to_file(tempfilename, engine=engine)
def test_to_file_schema(tmpdir, df_nybb, engine):
@@ -102,16 +421,61 @@ def test_to_file_schema(tmpdir, df_nybb, engine):
if it is specified
"""
- pass
+ tempfilename = os.path.join(str(tmpdir), "test.shp")
+ properties = OrderedDict(
+ [
+ ("Shape_Leng", "float:19.11"),
+ ("BoroName", "str:40"),
+ ("BoroCode", "int:10"),
+ ("Shape_Area", "float:19.11"),
+ ]
+ )
+ schema = {"geometry": "Polygon", "properties": properties}
+
+ if engine == "pyogrio":
+ with pytest.raises(ValueError):
+ df_nybb.iloc[:2].to_file(tempfilename, schema=schema, engine=engine)
+ else:
+ # Take the first 2 features to speed things up a bit
+ df_nybb.iloc[:2].to_file(tempfilename, schema=schema, engine=engine)
+
+ import fiona
+ with fiona.open(tempfilename) as f:
+ result_schema = f.schema
+    assert result_schema == schema
+
+
-@pytest.mark.skipif(not HAS_PYPROJ, reason='pyproj not installed')
+@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
def test_to_file_crs(tmpdir, engine, nybb_filename):
"""
Ensure that the file is written according to the crs
if it is specified
"""
- pass
+ df = read_file(nybb_filename, engine=engine)
+ tempfilename = os.path.join(str(tmpdir), "crs.shp")
+
+ # save correct CRS
+ df.to_file(tempfilename, engine=engine)
+ result = GeoDataFrame.from_file(tempfilename, engine=engine)
+ assert result.crs == df.crs
+
+ if engine == "pyogrio":
+ with pytest.raises(ValueError, match="Passing 'crs' is not supported"):
+ df.to_file(tempfilename, crs=3857, engine=engine)
+ return
+
+ # overwrite CRS
+ df.to_file(tempfilename, crs=3857, engine=engine)
+ result = GeoDataFrame.from_file(tempfilename, engine=engine)
+ assert result.crs == "epsg:3857"
+
+ # specify CRS for gdf without one
+ df2 = df.set_crs(None, allow_override=True)
+ df2.to_file(tempfilename, crs=2263, engine=engine)
+ df = GeoDataFrame.from_file(tempfilename, engine=engine)
+ assert df.crs == "epsg:2263"
def test_to_file_column_len(tmpdir, df_points, engine):
@@ -119,28 +483,643 @@ def test_to_file_column_len(tmpdir, df_points, engine):
Ensure that a warning about truncation is given when a geodataframe with
column names longer than 10 characters is saved to shapefile
"""
- pass
+ tempfilename = os.path.join(str(tmpdir), "test.shp")
+
+ df = df_points.iloc[:1].copy()
+ df["0123456789A"] = ["the column name is 11 characters"]
+
+ with pytest.warns(
+ UserWarning, match="Column names longer than 10 characters will be truncated"
+ ):
+ df.to_file(tempfilename, driver="ESRI Shapefile", engine=engine)
+
+def test_to_file_with_duplicate_columns(tmpdir, engine):
+ df = GeoDataFrame(data=[[1, 2, 3]], columns=["a", "b", "a"], geometry=[Point(1, 1)])
+ tempfilename = os.path.join(str(tmpdir), "duplicate.shp")
+ with pytest.raises(
+ ValueError, match="GeoDataFrame cannot contain duplicated column names."
+ ):
+ df.to_file(tempfilename, engine=engine)
-@pytest.mark.parametrize('driver,ext', driver_ext_pairs)
+
+@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
def test_append_file(tmpdir, df_nybb, df_null, driver, ext, engine):
"""Test to_file with append mode and from_file"""
- pass
+ tempfilename = os.path.join(str(tmpdir), "boros" + ext)
+ driver = driver if driver else _detect_driver(tempfilename)
+ df_nybb.to_file(tempfilename, driver=driver, engine=engine)
+ df_nybb.to_file(tempfilename, mode="a", driver=driver, engine=engine)
+ # Read layer back in
+ df = GeoDataFrame.from_file(tempfilename, engine=engine)
+ assert "geometry" in df
+ assert len(df) == (5 * 2)
+ expected = pd.concat([df_nybb] * 2, ignore_index=True)
+ assert_geodataframe_equal(df, expected, check_less_precise=True)
-@pytest.mark.filterwarnings("ignore:'crs' was not provided:UserWarning:pyogrio"
- )
-@pytest.mark.parametrize('driver,ext', driver_ext_pairs)
+ if engine == "pyogrio":
+ # for pyogrio also ensure append=True works
+ tempfilename = os.path.join(str(tmpdir), "boros2" + ext)
+ df_nybb.to_file(tempfilename, driver=driver, engine=engine)
+ df_nybb.to_file(tempfilename, append=True, driver=driver, engine=engine)
+ # Read layer back in
+ df = GeoDataFrame.from_file(tempfilename, engine=engine)
+ assert len(df) == (len(df_nybb) * 2)
+
+ # Write layer with null geometry out to file
+ tempfilename = os.path.join(str(tmpdir), "null_geom" + ext)
+ df_null.to_file(tempfilename, driver=driver, engine=engine)
+ df_null.to_file(tempfilename, mode="a", driver=driver, engine=engine)
+ # Read layer back in
+ df = GeoDataFrame.from_file(tempfilename, engine=engine)
+ assert "geometry" in df
+ assert len(df) == (2 * 2)
+ expected = pd.concat([df_null] * 2, ignore_index=True)
+ assert_geodataframe_equal(df, expected, check_less_precise=True)
+
+
+def test_mode_unsupported(tmpdir, df_nybb, engine):
+ tempfilename = os.path.join(str(tmpdir), "data.shp")
+ with pytest.raises(ValueError, match="'mode' should be one of 'w' or 'a'"):
+ df_nybb.to_file(tempfilename, mode="r", engine=engine)
+
+
+@pytest.mark.filterwarnings("ignore:'crs' was not provided:UserWarning:pyogrio")
+@pytest.mark.parametrize("driver,ext", driver_ext_pairs)
def test_empty_crs(tmpdir, driver, ext, engine):
"""Test handling of undefined CRS with GPKG driver (GH #1975)."""
- pass
+ if ext == ".gpkg":
+ pytest.xfail("GPKG is read with Undefined geographic SRS.")
+ tempfilename = os.path.join(str(tmpdir), "boros" + ext)
+ df = GeoDataFrame(
+ {
+ "a": [1.0, 2.0, 3.0],
+ "geometry": [Point(0, 0), Point(1, 1), Point(2, 2)],
+ },
+ )
-NYBB_CRS = 'epsg:2263'
+ df.to_file(tempfilename, driver=driver, engine=engine)
+ result = read_file(tempfilename, engine=engine)
+ if ext == ".geojson":
+ # geojson by default assumes epsg:4326
+ df.geometry.array.crs = "EPSG:4326"
+
+ assert_geodataframe_equal(result, df)
+
+
+# -----------------------------------------------------------------------------
+# read_file tests
+# -----------------------------------------------------------------------------
+
+
+NYBB_CRS = "epsg:2263"
+
+
+def test_read_file(engine, nybb_filename):
+ df = read_file(nybb_filename, engine=engine)
+ validate_boro_df(df)
+ if HAS_PYPROJ:
+ assert df.crs == NYBB_CRS
+ expected_columns = ["BoroCode", "BoroName", "Shape_Leng", "Shape_Area"]
+ assert (df.columns[:-1] == expected_columns).all()
+
+
+@pytest.mark.web
+@pytest.mark.parametrize(
+ "url",
+ [
+ # geojson url
+ "https://raw.githubusercontent.com/geopandas/geopandas/"
+ "main/geopandas/tests/data/null_geom.geojson",
+ # url to zip file
+ "https://raw.githubusercontent.com/geopandas/geopandas/"
+ "main/geopandas/tests/data/nybb_16a.zip",
+ # url to zipfile without extension
+ "https://geonode.goosocean.org/download/480",
+ # url to web service
+ "https://demo.pygeoapi.io/stable/collections/obs/items",
+ ],
+)
+def test_read_file_url(engine, url):
+ gdf = read_file(url, engine=engine)
+ assert isinstance(gdf, geopandas.GeoDataFrame)
+
+
+def test_read_file_local_uri(file_path, engine):
+ local_uri = "file://" + file_path
+ gdf = read_file(local_uri, engine=engine)
+ assert isinstance(gdf, geopandas.GeoDataFrame)
+
+
+@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
+def test_read_file_geojson_string_path(engine):
+ if engine == "pyogrio" and not PYOGRIO_GE_090:
+ pytest.skip("fixed in pyogrio 0.9.0")
+ expected = GeoDataFrame({"val_with_hash": ["row # 0"], "geometry": [Point(0, 1)]})
+ features = {
+ "type": "FeatureCollection",
+ "features": [
+ {
+ "type": "Feature",
+ "properties": {"val_with_hash": "row # 0"},
+ "geometry": {"type": "Point", "coordinates": [0.0, 1.0]},
+ }
+ ],
+ }
+ df_read = read_file(json.dumps(features))
+ assert_geodataframe_equal(expected.set_crs("EPSG:4326"), df_read)
+
+
+def test_read_file_textio(file_path, engine):
+ file_text_stream = open(file_path)
+ file_stringio = io.StringIO(open(file_path).read())
+ gdf_text_stream = read_file(file_text_stream, engine=engine)
+ gdf_stringio = read_file(file_stringio, engine=engine)
+ assert isinstance(gdf_text_stream, geopandas.GeoDataFrame)
+ assert isinstance(gdf_stringio, geopandas.GeoDataFrame)
+
+
+def test_read_file_bytesio(file_path, engine):
+ file_binary_stream = open(file_path, "rb")
+ file_bytesio = io.BytesIO(open(file_path, "rb").read())
+ gdf_binary_stream = read_file(file_binary_stream, engine=engine)
+ gdf_bytesio = read_file(file_bytesio, engine=engine)
+ assert isinstance(gdf_binary_stream, geopandas.GeoDataFrame)
+ assert isinstance(gdf_bytesio, geopandas.GeoDataFrame)
+
+
+def test_read_file_raw_stream(file_path, engine):
+ file_raw_stream = open(file_path, "rb", buffering=0)
+ gdf_raw_stream = read_file(file_raw_stream, engine=engine)
+ assert isinstance(gdf_raw_stream, geopandas.GeoDataFrame)
+
+
+def test_read_file_pathlib(file_path, engine):
+ path_object = pathlib.Path(file_path)
+ gdf_path_object = read_file(path_object, engine=engine)
+ assert isinstance(gdf_path_object, geopandas.GeoDataFrame)
+
+
+def test_read_file_tempfile(engine):
+ temp = tempfile.TemporaryFile()
+ temp.write(
+ b"""
+ {
+ "type": "Feature",
+ "geometry": {
+ "type": "Point",
+ "coordinates": [0, 0]
+ },
+ "properties": {
+ "name": "Null Island"
+ }
+ }
+ """
+ )
+ temp.seek(0)
+ gdf_tempfile = geopandas.read_file(temp, engine=engine)
+ assert isinstance(gdf_tempfile, geopandas.GeoDataFrame)
+ temp.close()
+
+
+def test_read_binary_file_fsspec(engine, nybb_filename):
+ fsspec = pytest.importorskip("fsspec")
+    # Remove the zip scheme so that fsspec doesn't open it as a zipped filesystem;
+    # instead we want to read the raw bytes and let the underlying engine decode them.
+ path = nybb_filename[6:]
+ with fsspec.open(path, "rb") as f:
+ gdf = read_file(f, engine=engine)
+ assert isinstance(gdf, geopandas.GeoDataFrame)
+
+
+def test_read_text_file_fsspec(file_path, engine):
+ fsspec = pytest.importorskip("fsspec")
+ with fsspec.open(file_path, "r") as f:
+ gdf = read_file(f, engine=engine)
+ assert isinstance(gdf, geopandas.GeoDataFrame)
+
+
+def test_infer_zipped_file(engine, nybb_filename):
+    # Remove the zip scheme so that read_file's zipped-file detection
+    # kicks in and adds the scheme back itself.
+ path = nybb_filename[6:]
+ gdf = read_file(path, engine=engine)
+ assert isinstance(gdf, geopandas.GeoDataFrame)
+
+ # Check that it can successfully add a zip scheme to a path that already has a
+ # scheme
+ gdf = read_file("file+file://" + path, engine=engine)
+ assert isinstance(gdf, geopandas.GeoDataFrame)
+
+ # Check that it can add a zip scheme for a path that includes a subpath
+ # within the archive.
+ gdf = read_file(path + "!nybb.shp", engine=engine)
+ assert isinstance(gdf, geopandas.GeoDataFrame)
+
+
+def test_allow_legacy_gdal_path(engine, nybb_filename):
+ # Construct a GDAL-style zip path.
+ path = "/vsizip/" + nybb_filename[6:]
+ gdf = read_file(path, engine=engine)
+ assert isinstance(gdf, geopandas.GeoDataFrame)
+
+
+@pytest.mark.skipif(not PYOGRIO_GE_090, reason="bug fixed in pyogrio 0.9.0")
+def test_read_file_with_hash_in_path(engine, nybb_filename, tmp_path):
+ folder_with_hash = tmp_path / "path with # present"
+ folder_with_hash.mkdir(exist_ok=True, parents=True)
+ read_path = folder_with_hash / "nybb.zip"
+ shutil.copy(nybb_filename[6:], read_path)
+ gdf = read_file(read_path, engine=engine)
+ assert isinstance(gdf, geopandas.GeoDataFrame)
+
+
+def test_read_file_bbox_tuple(df_nybb, engine, nybb_filename):
+ bbox = (
+ 1031051.7879884212,
+ 224272.49231459625,
+ 1047224.3104931959,
+ 244317.30894023244,
+ )
+ filtered_df = read_file(nybb_filename, bbox=bbox, engine=engine)
+ expected = df_nybb[df_nybb["BoroName"].isin(["Bronx", "Queens"])]
+ assert_geodataframe_equal(filtered_df, expected.reset_index(drop=True))
+
+
+def test_read_file_bbox_polygon(df_nybb, engine, nybb_filename):
+ bbox = box(
+ 1031051.7879884212, 224272.49231459625, 1047224.3104931959, 244317.30894023244
+ )
+ filtered_df = read_file(nybb_filename, bbox=bbox, engine=engine)
+ expected = df_nybb[df_nybb["BoroName"].isin(["Bronx", "Queens"])]
+ assert_geodataframe_equal(filtered_df, expected.reset_index(drop=True))
+
+
+def test_read_file_filtered__rows(df_nybb, engine, nybb_filename):
+ filtered_df = read_file(nybb_filename, rows=1, engine=engine)
+ assert_geodataframe_equal(filtered_df, df_nybb.iloc[[0], :])
+
+
+def test_read_file_filtered__rows_slice(df_nybb, engine, nybb_filename):
+ filtered_df = read_file(nybb_filename, rows=slice(1, 3), engine=engine)
+ assert_geodataframe_equal(filtered_df, df_nybb.iloc[1:3, :].reset_index(drop=True))
+
+
+@pytest.mark.filterwarnings(
+ "ignore:Layer does not support OLC_FASTFEATURECOUNT:RuntimeWarning"
+) # for the slice with -1
+def test_read_file_filtered__rows_bbox(df_nybb, engine, nybb_filename):
+ bbox = (
+ 1031051.7879884212,
+ 224272.49231459625,
+ 1047224.3104931959,
+ 244317.30894023244,
+ )
+ if engine == "fiona":
+ # combination bbox and rows (rows slice applied after bbox filtering!)
+ filtered_df = read_file(
+ nybb_filename, bbox=bbox, rows=slice(4, None), engine=engine
+ )
+ assert filtered_df.empty
+
+ if engine == "pyogrio":
+ # TODO: support negative rows in pyogrio
+ with pytest.raises(
+ ValueError,
+ match="'skip_features' must be between 0 and 1|Negative slice start",
+ ):
+ filtered_df = read_file(
+ nybb_filename, bbox=bbox, rows=slice(-1, None), engine=engine
+ )
+ else:
+ filtered_df = read_file(
+ nybb_filename, bbox=bbox, rows=slice(-1, None), engine=engine
+ )
+ filtered_df["BoroCode"] = filtered_df["BoroCode"].astype("int64")
+ assert_geodataframe_equal(
+ filtered_df, df_nybb.iloc[4:, :].reset_index(drop=True)
+ )
+
+
+def test_read_file_filtered_rows_invalid(engine, nybb_filename):
+ with pytest.raises(TypeError):
+ read_file(nybb_filename, rows="not_a_slice", engine=engine)
+
+
+def test_read_file__ignore_geometry(engine, naturalearth_lowres):
+ pdf = geopandas.read_file(
+ naturalearth_lowres,
+ ignore_geometry=True,
+ engine=engine,
+ )
+ assert "geometry" not in pdf.columns
+    assert isinstance(pdf, pd.DataFrame) and not isinstance(pdf, geopandas.GeoDataFrame)
+
+
-class FileNumber(object):
+@pytest.mark.filterwarnings(
+ "ignore:The 'include_fields' and 'ignore_fields' keywords:DeprecationWarning"
+)
+def test_read_file__ignore_fields(engine, naturalearth_lowres):
+ gdf = geopandas.read_file(
+ naturalearth_lowres,
+ ignore_fields=["pop_est", "continent", "iso_a3", "gdp_md_est"],
+ engine=engine,
+ )
+ assert gdf.columns.tolist() == ["name", "geometry"]
+
+
+@pytest.mark.filterwarnings(
+ "ignore:The 'include_fields' and 'ignore_fields' keywords:DeprecationWarning"
+)
+def test_read_file__ignore_all_fields(engine, naturalearth_lowres):
+ gdf = geopandas.read_file(
+ naturalearth_lowres,
+ ignore_fields=["pop_est", "continent", "name", "iso_a3", "gdp_md_est"],
+ engine=engine,
+ )
+ assert gdf.columns.tolist() == ["geometry"]
+
+
+def test_read_file_missing_geometry(tmpdir, engine):
+ filename = str(tmpdir / "test.csv")
+
+ expected = pd.DataFrame(
+ {"col1": np.array([1, 2, 3], dtype="int64"), "col2": ["a", "b", "c"]}
+ )
+ expected.to_csv(filename, index=False)
+
+ df = geopandas.read_file(filename, engine=engine)
+ # both engines read integers as strings; force back to original type
+ df["col1"] = df["col1"].astype("int64")
+
+ assert isinstance(df, pd.DataFrame)
+ assert not isinstance(df, geopandas.GeoDataFrame)
+
+ assert_frame_equal(df, expected)
+
+
+def test_read_file_None_attribute(tmp_path, engine):
+ # Test added in context of https://github.com/geopandas/geopandas/issues/2901
+ test_path = tmp_path / "test.gpkg"
+ gdf = GeoDataFrame(
+ {"a": [None, None]}, geometry=[Point(1, 2), Point(3, 4)], crs=4326
+ )
+
+ gdf.to_file(test_path, engine=engine)
+ read_gdf = read_file(test_path, engine=engine)
+ assert_geodataframe_equal(gdf, read_gdf)
+
+
+def test_read_csv_dtype(tmpdir, df_nybb):
+ filename = str(tmpdir / "test.csv")
+
+ df_nybb.to_csv(filename, index=False)
+ pdf = pd.read_csv(filename, dtype={"geometry": "geometry"})
+
+ assert pdf.geometry.dtype == "geometry"
+
+
+def test_read_file__where_filter(engine, naturalearth_lowres):
+ if FIONA_GE_19 or engine == "pyogrio":
+ gdf = geopandas.read_file(
+ naturalearth_lowres,
+ where="continent='Africa'",
+ engine=engine,
+ )
+ assert gdf.continent.unique().tolist() == ["Africa"]
+ else:
+ with pytest.raises(NotImplementedError):
+ geopandas.read_file(
+ naturalearth_lowres,
+ where="continent='Africa'",
+ engine="fiona",
+ )
+
+
+def test_read_file__columns(engine, naturalearth_lowres):
+ if engine == "fiona" and not FIONA_GE_19:
+ pytest.skip("columns requires fiona 1.9+")
+
+ gdf = geopandas.read_file(
+ naturalearth_lowres, columns=["name", "pop_est"], engine=engine
+ )
+ assert gdf.columns.tolist() == ["name", "pop_est", "geometry"]
+
+
+def test_read_file__columns_empty(engine, naturalearth_lowres):
+ if engine == "fiona" and not FIONA_GE_19:
+ pytest.skip("columns requires fiona 1.9+")
+
+ gdf = geopandas.read_file(naturalearth_lowres, columns=[], engine=engine)
+ assert gdf.columns.tolist() == ["geometry"]
+
+
+@pytest.mark.skipif(FIONA_GE_19 or not fiona, reason="test for fiona < 1.9")
+def test_read_file__columns_old_fiona(naturalearth_lowres):
+ with pytest.raises(NotImplementedError):
+ geopandas.read_file(
+ naturalearth_lowres, columns=["name", "pop_est"], engine="fiona"
+ )
+
+
+@pytest.mark.filterwarnings(
+ "ignore:The 'include_fields' and 'ignore_fields' keywords:DeprecationWarning"
+)
+def test_read_file__include_fields(engine, naturalearth_lowres):
+ if engine == "fiona" and not FIONA_GE_19:
+ pytest.skip("columns requires fiona 1.9+")
+
+ gdf = geopandas.read_file(
+ naturalearth_lowres, include_fields=["name", "pop_est"], engine=engine
+ )
+ assert gdf.columns.tolist() == ["name", "pop_est", "geometry"]
+
+
+@pytest.mark.skipif(not FIONA_GE_19, reason="columns requires fiona 1.9+")
+def test_read_file__columns_conflicting_keywords(engine, naturalearth_lowres):
+ path = naturalearth_lowres
+
+ with pytest.raises(ValueError, match="Cannot specify both"):
+ geopandas.read_file(
+ path, include_fields=["name"], ignore_fields=["pop_est"], engine=engine
+ )
+
+ with pytest.raises(ValueError, match="Cannot specify both"):
+ geopandas.read_file(
+ path, columns=["name"], include_fields=["pop_est"], engine=engine
+ )
+
+ with pytest.raises(ValueError, match="Cannot specify both"):
+ geopandas.read_file(
+ path, columns=["name"], ignore_fields=["pop_est"], engine=engine
+ )
+
+
+@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
+@pytest.mark.parametrize("file_like", [False, True])
+def test_read_file_bbox_gdf(df_nybb, engine, nybb_filename, file_like):
+ full_df_shape = df_nybb.shape
+ bbox = geopandas.GeoDataFrame(
+ geometry=[
+ box(
+ 1031051.7879884212,
+ 224272.49231459625,
+ 1047224.3104931959,
+ 244317.30894023244,
+ )
+ ],
+ crs=NYBB_CRS,
+ )
+ infile = (
+ open(nybb_filename.replace("zip://", ""), "rb") if file_like else nybb_filename
+ )
+ filtered_df = read_file(infile, bbox=bbox, engine=engine)
+ filtered_df_shape = filtered_df.shape
+ assert full_df_shape != filtered_df_shape
+ assert filtered_df_shape == (2, 5)
+
+
+@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
+@pytest.mark.parametrize("file_like", [False, True])
+def test_read_file_mask_gdf(df_nybb, engine, nybb_filename, file_like):
+ full_df_shape = df_nybb.shape
+ mask = geopandas.GeoDataFrame(
+ geometry=[
+ box(
+ 1031051.7879884212,
+ 224272.49231459625,
+ 1047224.3104931959,
+ 244317.30894023244,
+ )
+ ],
+ crs=NYBB_CRS,
+ )
+ infile = (
+ open(nybb_filename.replace("zip://", ""), "rb") if file_like else nybb_filename
+ )
+ filtered_df = read_file(infile, mask=mask, engine=engine)
+ filtered_df_shape = filtered_df.shape
+ assert full_df_shape != filtered_df_shape
+ assert filtered_df_shape == (2, 5)
+
+
+def test_read_file_mask_polygon(df_nybb, engine, nybb_filename):
+ full_df_shape = df_nybb.shape
+ mask = box(
+ 1031051.7879884212, 224272.49231459625, 1047224.3104931959, 244317.30894023244
+ )
+ filtered_df = read_file(nybb_filename, mask=mask, engine=engine)
+ filtered_df_shape = filtered_df.shape
+ assert full_df_shape != filtered_df_shape
+ assert filtered_df_shape == (2, 5)
+
+
+def test_read_file_mask_geojson(df_nybb, nybb_filename, engine):
+ full_df_shape = df_nybb.shape
+ mask = mapping(
+ box(
+ 1031051.7879884212,
+ 224272.49231459625,
+ 1047224.3104931959,
+ 244317.30894023244,
+ )
+ )
+ filtered_df = read_file(nybb_filename, mask=mask, engine=engine)
+ filtered_df_shape = filtered_df.shape
+ assert full_df_shape != filtered_df_shape
+ assert filtered_df_shape == (2, 5)
+
+
+@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
+def test_read_file_bbox_gdf_mismatched_crs(df_nybb, engine, nybb_filename):
+ full_df_shape = df_nybb.shape
+ bbox = geopandas.GeoDataFrame(
+ geometry=[
+ box(
+ 1031051.7879884212,
+ 224272.49231459625,
+ 1047224.3104931959,
+ 244317.30894023244,
+ )
+ ],
+ crs=NYBB_CRS,
+ )
+ bbox.to_crs(epsg=4326, inplace=True)
+ filtered_df = read_file(nybb_filename, bbox=bbox, engine=engine)
+ filtered_df_shape = filtered_df.shape
+ assert full_df_shape != filtered_df_shape
+ assert filtered_df_shape == (2, 5)
+
+
+@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
+def test_read_file_mask_gdf_mismatched_crs(df_nybb, engine, nybb_filename):
+ full_df_shape = df_nybb.shape
+ mask = geopandas.GeoDataFrame(
+ geometry=[
+ box(
+ 1031051.7879884212,
+ 224272.49231459625,
+ 1047224.3104931959,
+ 244317.30894023244,
+ )
+ ],
+ crs=NYBB_CRS,
+ )
+ mask.to_crs(epsg=4326, inplace=True)
+ filtered_df = read_file(nybb_filename, mask=mask.geometry, engine=engine)
+ filtered_df_shape = filtered_df.shape
+ assert full_df_shape != filtered_df_shape
+ assert filtered_df_shape == (2, 5)
+
+
+def test_read_file_bbox_mask_not_allowed(engine, nybb_filename):
+ bbox = (
+ 1031051.7879884212,
+ 224272.49231459625,
+ 1047224.3104931959,
+ 244317.30894023244,
+ )
+
+ mask = box(*bbox)
+
+ with pytest.raises(ValueError, match="mask and bbox can not be set together"):
+ read_file(nybb_filename, bbox=bbox, mask=mask)
+
+
+@pytest.mark.filterwarnings(
+ "ignore:Layer 'b'test_empty'' does not have any features:UserWarning"
+)
+def test_read_file_empty_shapefile(tmpdir, engine):
+ if engine == "pyogrio" and not fiona:
+ pytest.skip("test requires fiona to work")
+ from geopandas.io.file import fiona_env
+
+ # create empty shapefile
+ meta = {
+ "crs": {},
+ "crs_wkt": "",
+ "driver": "ESRI Shapefile",
+ "schema": {
+ "geometry": "Point",
+ "properties": OrderedDict([("A", "int:9"), ("Z", "float:24.15")]),
+ },
+ }
+
+ fname = str(tmpdir.join("test_empty.shp"))
+
+ with fiona_env():
+ with fiona.open(fname, "w", **meta) as _:
+ pass
+
+ empty = read_file(fname, engine=engine)
+ assert isinstance(empty, geopandas.GeoDataFrame)
+ assert all(empty.columns == ["A", "Z", "geometry"])
+
+
+
+class FileNumber(object):
def __init__(self, tmpdir, base, ext):
self.tmpdir = str(tmpdir)
self.base = base
@@ -148,9 +1127,312 @@ class FileNumber(object):
self.fileno = 0
def __repr__(self):
- filename = '{0}{1:02d}.{2}'.format(self.base, self.fileno, self.ext)
+ filename = "{0}{1:02d}.{2}".format(self.base, self.fileno, self.ext)
return os.path.join(self.tmpdir, filename)
def __next__(self):
self.fileno += 1
return repr(self)
+
+
+@pytest.mark.parametrize(
+ "driver,ext", [("ESRI Shapefile", "shp"), ("GeoJSON", "geojson")]
+)
+def test_write_index_to_file(tmpdir, df_points, driver, ext, engine):
+ fngen = FileNumber(tmpdir, "check", ext)
+
+ def do_checks(df, index_is_used):
+ # check combinations of index=None|True|False on GeoDataFrame/GeoSeries
+ other_cols = list(df.columns)
+ other_cols.remove("geometry")
+
+ if driver == "ESRI Shapefile":
+ # ESRI Shapefile will add FID if no other columns exist
+ driver_col = ["FID"]
+ else:
+ driver_col = []
+
+ if index_is_used:
+ index_cols = list(df.index.names)
+ else:
+ index_cols = [None] * len(df.index.names)
+
+ # replicate pandas' default index names for regular and MultiIndex
+ if index_cols == [None]:
+ index_cols = ["index"]
+ elif len(index_cols) > 1 and not all(index_cols):
+ for level, index_col in enumerate(index_cols):
+ if index_col is None:
+ index_cols[level] = "level_" + str(level)
+
+ # check GeoDataFrame with default index=None to autodetect
+ tempfilename = next(fngen)
+ df.to_file(tempfilename, driver=driver, index=None, engine=engine)
+ df_check = read_file(tempfilename, engine=engine)
+ if len(other_cols) == 0:
+ expected_cols = driver_col[:]
+ else:
+ expected_cols = []
+ if index_is_used:
+ expected_cols += index_cols
+ expected_cols += other_cols + ["geometry"]
+ assert list(df_check.columns) == expected_cols
+
+ # similar check on GeoSeries with index=None
+ tempfilename = next(fngen)
+ df.geometry.to_file(tempfilename, driver=driver, index=None, engine=engine)
+ df_check = read_file(tempfilename, engine=engine)
+ if index_is_used:
+ expected_cols = index_cols + ["geometry"]
+ else:
+ expected_cols = driver_col + ["geometry"]
+ assert list(df_check.columns) == expected_cols
+
+ # check GeoDataFrame with index=True
+ tempfilename = next(fngen)
+ df.to_file(tempfilename, driver=driver, index=True, engine=engine)
+ df_check = read_file(tempfilename, engine=engine)
+ assert list(df_check.columns) == index_cols + other_cols + ["geometry"]
+
+ # similar check on GeoSeries with index=True
+ tempfilename = next(fngen)
+ df.geometry.to_file(tempfilename, driver=driver, index=True, engine=engine)
+ df_check = read_file(tempfilename, engine=engine)
+ assert list(df_check.columns) == index_cols + ["geometry"]
+
+ # check GeoDataFrame with index=False
+ tempfilename = next(fngen)
+ df.to_file(tempfilename, driver=driver, index=False, engine=engine)
+ df_check = read_file(tempfilename, engine=engine)
+ if len(other_cols) == 0:
+ expected_cols = driver_col + ["geometry"]
+ else:
+ expected_cols = other_cols + ["geometry"]
+ assert list(df_check.columns) == expected_cols
+
+ # similar check on GeoSeries with index=False
+ tempfilename = next(fngen)
+ df.geometry.to_file(tempfilename, driver=driver, index=False, engine=engine)
+ df_check = read_file(tempfilename, engine=engine)
+ assert list(df_check.columns) == driver_col + ["geometry"]
+
+ #
+ # Checks where index is not used/saved
+ #
+
+ # index is a default RangeIndex
+ df_p = df_points.copy()
+ df = GeoDataFrame(df_p["value1"], geometry=df_p.geometry)
+ do_checks(df, index_is_used=False)
+
+ # index is a RangeIndex, starting from 1
+ df.index += 1
+ do_checks(df, index_is_used=False)
+
+    # index is an Int64Index with a regular sequence starting from 1
+ df_p.index = list(range(1, len(df) + 1))
+ df = GeoDataFrame(df_p["value1"], geometry=df_p.geometry)
+ do_checks(df, index_is_used=False)
+
+    # index was a default RangeIndex, but dropping one row turns it into an Int64Index
+ df_p = df_points.copy()
+ df = GeoDataFrame(df_p["value1"], geometry=df_p.geometry).drop(5, axis=0)
+ do_checks(df, index_is_used=False)
+
+ # no other columns (except geometry)
+ df = GeoDataFrame(geometry=df_p.geometry)
+ do_checks(df, index_is_used=False)
+
+ #
+ # Checks where index is used/saved
+ #
+
+ # named index
+ df_p = df_points.copy()
+ df = GeoDataFrame(df_p["value1"], geometry=df_p.geometry)
+ df.index.name = "foo_index"
+ do_checks(df, index_is_used=True)
+
+ # named index, same as pandas' default name after .reset_index(drop=False)
+ df.index.name = "index"
+ do_checks(df, index_is_used=True)
+
+ # named MultiIndex
+ df_p = df_points.copy()
+ df_p["value3"] = df_p["value2"] - df_p["value1"]
+ df_p.set_index(["value1", "value2"], inplace=True)
+ df = GeoDataFrame(df_p, geometry=df_p.geometry)
+ do_checks(df, index_is_used=True)
+
+ # partially unnamed MultiIndex
+ df.index.names = ["first", None]
+ do_checks(df, index_is_used=True)
+
+ # unnamed MultiIndex
+ df.index.names = [None, None]
+ do_checks(df, index_is_used=True)
+
+ # unnamed Float64Index
+ df_p = df_points.copy()
+ df = GeoDataFrame(df_p["value1"], geometry=df_p.geometry)
+ df.index = df_p.index.astype(float) / 10
+ do_checks(df, index_is_used=True)
+
+ # named Float64Index
+ df.index.name = "centile"
+ do_checks(df, index_is_used=True)
+
+ # index as string
+ df_p = df_points.copy()
+ df = GeoDataFrame(df_p["value1"], geometry=df_p.geometry)
+ df.index = pd.to_timedelta(range(len(df)), unit="days")
+ # TODO: TimedeltaIndex is an invalid field type
+ df.index = df.index.astype(str)
+ do_checks(df, index_is_used=True)
+
+ # unnamed DatetimeIndex
+ df_p = df_points.copy()
+ df = GeoDataFrame(df_p["value1"], geometry=df_p.geometry)
+ df.index = pd.to_timedelta(range(len(df)), unit="days") + pd.to_datetime(
+ ["1999-12-27"] * len(df)
+ )
+ if driver == "ESRI Shapefile":
+ # Shapefile driver does not support datetime fields
+ df.index = df.index.astype(str)
+ do_checks(df, index_is_used=True)
+
+ # named DatetimeIndex
+ df.index.name = "datetime"
+ do_checks(df, index_is_used=True)
+
+
+def test_to_file__undetermined_driver(tmp_path, df_nybb):
+ shpdir = tmp_path / "boros.invalid"
+ df_nybb.to_file(shpdir)
+ assert shpdir.is_dir()
+ assert list(shpdir.glob("*.shp"))
+
+
+@pytest.mark.parametrize(
+ "test_file", [(pathlib.Path("~/test_file.geojson")), "~/test_file.geojson"]
+)
+def test_write_read_file(test_file, engine):
+ gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs=_CRS)
+ gdf.to_file(test_file, driver="GeoJSON")
+ df_json = geopandas.read_file(test_file, engine=engine)
+ assert_geodataframe_equal(gdf, df_json, check_crs=True)
+ os.remove(os.path.expanduser(test_file))
+
+
+@pytest.mark.skipif(fiona is False, reason="Fiona not available")
+@pytest.mark.skipif(FIONA_GE_19, reason="Fiona >= 1.9 supports metadata")
+def test_to_file_metadata_unsupported_fiona_version(tmp_path, df_points):
+ metadata = {"title": "test"}
+ tmp_file = tmp_path / "test.gpkg"
+ match = "'metadata' keyword is only supported for Fiona >= 1.9"
+ with pytest.raises(NotImplementedError, match=match):
+ df_points.to_file(tmp_file, driver="GPKG", engine="fiona", metadata=metadata)
+
+
+@pytest.mark.skipif(not FIONA_GE_19, reason="only Fiona >= 1.9 supports metadata")
+def test_to_file_metadata_supported_fiona_version(tmp_path, df_points):
+ metadata = {"title": "test"}
+ tmp_file = tmp_path / "test.gpkg"
+
+ df_points.to_file(tmp_file, driver="GPKG", engine="fiona", metadata=metadata)
+
+ # Check that metadata is written to the file
+ with fiona.open(tmp_file) as src:
+ tags = src.tags()
+ assert tags == metadata
+
+
+@pytest.mark.skipif(pyogrio is False, reason="Pyogrio not available")
+def test_to_file_metadata_pyogrio(tmp_path, df_points):
+ metadata = {"title": "test"}
+ tmp_file = tmp_path / "test.gpkg"
+
+ df_points.to_file(tmp_file, driver="GPKG", engine="pyogrio", metadata=metadata)
+
+ # Check that metadata is written to the file
+ info = pyogrio.read_info(tmp_file)
+ layer_metadata = info["layer_metadata"]
+ assert layer_metadata == metadata
+
+
+@pytest.mark.parametrize(
+ "driver, ext", [("ESRI Shapefile", ".shp"), ("GeoJSON", ".geojson")]
+)
+def test_to_file_metadata_unsupported_driver(driver, ext, tmpdir, df_points, engine):
+ metadata = {"title": "Test"}
+ tempfilename = os.path.join(str(tmpdir), "test" + ext)
+ with pytest.raises(
+ NotImplementedError, match="'metadata' keyword is only supported for"
+ ):
+ df_points.to_file(tempfilename, driver=driver, metadata=metadata)
+
+
+def test_multiple_geom_cols_error(tmpdir, df_nybb):
+ df_nybb["geom2"] = df_nybb.geometry
+ with pytest.raises(ValueError, match="GeoDataFrame contains multiple geometry"):
+ df_nybb.to_file(os.path.join(str(tmpdir), "boros.gpkg"))
+
+
+@PYOGRIO_MARK
+@FIONA_MARK
+def test_option_io_engine(nybb_filename):
+ try:
+ geopandas.options.io_engine = "pyogrio"
+
+        # disallow reading Shapefiles with fiona to ensure we are
+        # actually reading with pyogrio
+ import fiona
+
+ orig = fiona.supported_drivers["ESRI Shapefile"]
+ fiona.supported_drivers["ESRI Shapefile"] = "w"
+
+ _ = geopandas.read_file(nybb_filename)
+ finally:
+ fiona.supported_drivers["ESRI Shapefile"] = orig
+ geopandas.options.io_engine = None
+
+
+@pytest.mark.skipif(pyogrio, reason="test for pyogrio not installed")
+def test_error_engine_unavailable_pyogrio(tmp_path, df_points, file_path):
+
+ with pytest.raises(ImportError, match="the 'read_file' function requires"):
+ geopandas.read_file(file_path, engine="pyogrio")
+
+ with pytest.raises(ImportError, match="the 'to_file' method requires"):
+ df_points.to_file(tmp_path / "test.gpkg", engine="pyogrio")
+
+
+@pytest.mark.skipif(fiona, reason="test for fiona not installed")
+def test_error_engine_unavailable_fiona(tmp_path, df_points, file_path):
+
+ with pytest.raises(ImportError, match="the 'read_file' function requires"):
+ geopandas.read_file(file_path, engine="fiona")
+
+ with pytest.raises(ImportError, match="the 'to_file' method requires"):
+ df_points.to_file(tmp_path / "test.gpkg", engine="fiona")
+
+
+@PYOGRIO_MARK
+def test_list_layers(df_points, tmpdir):
+ tempfilename = os.path.join(str(tmpdir), "dataset.gpkg")
+ df_points.to_file(tempfilename, layer="original")
+ df_points.set_geometry(df_points.buffer(1)).to_file(tempfilename, layer="buffered")
+ df_points.set_geometry(df_points.buffer(2).boundary).to_file(
+ tempfilename, layer="boundary"
+ )
+ pyogrio.write_dataframe(
+ df_points[["value1", "value2"]], tempfilename, layer="non-spatial"
+ )
+ layers = geopandas.list_layers(tempfilename)
+ expected = pd.DataFrame(
+ {
+ "name": ["original", "buffered", "boundary", "non-spatial"],
+ "geometry_type": ["Point", "Polygon", "LineString", None],
+ }
+ )
+ assert_frame_equal(layers, expected)
diff --git a/geopandas/io/tests/test_file_geom_types_drivers.py b/geopandas/io/tests/test_file_geom_types_drivers.py
index b81ff5f8..b28260fb 100644
--- a/geopandas/io/tests/test_file_geom_types_drivers.py
+++ b/geopandas/io/tests/test_file_geom_types_drivers.py
@@ -1,94 +1,328 @@
import os
-from shapely.geometry import LineString, MultiLineString, MultiPoint, MultiPolygon, Point, Polygon
+
+from shapely.geometry import (
+ LineString,
+ MultiLineString,
+ MultiPoint,
+ MultiPolygon,
+ Point,
+ Polygon,
+)
+
import geopandas
from geopandas import GeoDataFrame
+
from .test_file import FIONA_MARK, PYOGRIO_MARK
+
import pytest
from geopandas.testing import assert_geodataframe_equal
-city_hall_boundaries = Polygon(((-73.5541107525234, 45.5091983609661), (-
- 73.5546126200639, 45.5086813829106), (-73.5540185061397,
- 45.5084409343852), (-73.5539986525799, 45.5084323044531), (-
- 73.5535801792994, 45.5089539203786), (-73.5541107525234, 45.5091983609661))
+
+# Credit: Polygons below come from Montreal city Open Data portal
+# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
+city_hall_boundaries = Polygon(
+ (
+ (-73.5541107525234, 45.5091983609661),
+ (-73.5546126200639, 45.5086813829106),
+ (-73.5540185061397, 45.5084409343852),
+ (-73.5539986525799, 45.5084323044531),
+ (-73.5535801792994, 45.5089539203786),
+ (-73.5541107525234, 45.5091983609661),
+ )
+)
+vauquelin_place = Polygon(
+ (
+ (-73.5542465586147, 45.5081555487952),
+ (-73.5540185061397, 45.5084409343852),
+ (-73.5546126200639, 45.5086813829106),
+ (-73.5548825850032, 45.5084033554357),
+ (-73.5542465586147, 45.5081555487952),
)
-vauquelin_place = Polygon(((-73.5542465586147, 45.5081555487952), (-
- 73.5540185061397, 45.5084409343852), (-73.5546126200639,
- 45.5086813829106), (-73.5548825850032, 45.5084033554357), (-
- 73.5542465586147, 45.5081555487952)))
-city_hall_walls = [LineString(((-73.5541107525234, 45.5091983609661), (-
- 73.5546126200639, 45.5086813829106), (-73.5540185061397,
- 45.5084409343852))), LineString(((-73.5539986525799, 45.5084323044531),
- (-73.5535801792994, 45.5089539203786), (-73.5541107525234,
- 45.5091983609661)))]
+)
+
+city_hall_walls = [
+ LineString(
+ (
+ (-73.5541107525234, 45.5091983609661),
+ (-73.5546126200639, 45.5086813829106),
+ (-73.5540185061397, 45.5084409343852),
+ )
+ ),
+ LineString(
+ (
+ (-73.5539986525799, 45.5084323044531),
+ (-73.5535801792994, 45.5089539203786),
+ (-73.5541107525234, 45.5091983609661),
+ )
+ ),
+]
+
city_hall_entrance = Point(-73.553785, 45.508722)
-city_hall_balcony = Point(-73.554138, 45.50908)
+city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)
+
point_3D = Point(-73.553785, 45.508722, 300)
-class _ExpectedError:
+# *****************************************
+# TEST TOOLING
+
+
+class _ExpectedError:
def __init__(self, error_type, error_message_match):
self.type = error_type
self.match = error_message_match
class _ExpectedErrorBuilder:
-
def __init__(self, composite_key):
self.composite_key = composite_key
+
+    def to_raise(self, error_type, error_match):
+ _expected_exceptions[self.composite_key] = _ExpectedError(
+ error_type, error_match
+ )
+
+
+def _expect_writing(gdf, ogr_driver):
+ return _ExpectedErrorBuilder(_composite_key(gdf, ogr_driver))
+
+
+def _composite_key(gdf, ogr_driver):
+ return frozenset([id(gdf), ogr_driver])
+
+
+def _expected_error_on(gdf, ogr_driver):
+ composite_key = _composite_key(gdf, ogr_driver)
+ return _expected_exceptions.get(composite_key, None)
+
+
+# *****************************************
+# TEST CASES
_geodataframes_to_write = []
_expected_exceptions = {}
-_CRS = 'epsg:4326'
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=[city_hall_entrance,
- city_hall_balcony])
+_CRS = "epsg:4326"
+
+# ------------------
+# gdf with Points
+gdf = GeoDataFrame(
+ {"a": [1, 2]}, crs=_CRS, geometry=[city_hall_entrance, city_hall_balcony]
+)
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=[MultiPoint([
- city_hall_balcony, city_hall_council_chamber]), MultiPoint([
- city_hall_entrance, city_hall_balcony, city_hall_council_chamber])])
+
+# ------------------
+# gdf with MultiPoints
+gdf = GeoDataFrame(
+ {"a": [1, 2]},
+ crs=_CRS,
+ geometry=[
+ MultiPoint([city_hall_balcony, city_hall_council_chamber]),
+ MultiPoint([city_hall_entrance, city_hall_balcony, city_hall_council_chamber]),
+ ],
+)
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=[MultiPoint([
- city_hall_entrance, city_hall_balcony]), city_hall_balcony])
+
+# ------------------
+# gdf with Points and MultiPoints
+gdf = GeoDataFrame(
+ {"a": [1, 2]},
+ crs=_CRS,
+ geometry=[MultiPoint([city_hall_entrance, city_hall_balcony]), city_hall_balcony],
+)
_geodataframes_to_write.append(gdf)
-_expect_writing(gdf, 'ESRI Shapefile').to_raise(RuntimeError,
- 'Failed to write record')
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=city_hall_walls)
+# 'ESRI Shapefile' driver supports writing LineString/MultiLinestring and
+# Polygon/MultiPolygon but does not mention Point/MultiPoint
+# see https://www.gdal.org/drv_shapefile.html
+_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
+
+# ------------------
+# gdf with LineStrings
+gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=city_hall_walls)
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=[MultiLineString(
- city_hall_walls), MultiLineString(city_hall_walls)])
+
+# ------------------
+# gdf with MultiLineStrings
+gdf = GeoDataFrame(
+ {"a": [1, 2]},
+ crs=_CRS,
+ geometry=[MultiLineString(city_hall_walls), MultiLineString(city_hall_walls)],
+)
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=[MultiLineString(
- city_hall_walls), city_hall_walls[0]])
+
+# ------------------
+# gdf with LineStrings and MultiLineStrings
+gdf = GeoDataFrame(
+ {"a": [1, 2]},
+ crs=_CRS,
+ geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]],
+)
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=[city_hall_boundaries,
- vauquelin_place])
+
+# ------------------
+# gdf with Polygons
+gdf = GeoDataFrame(
+ {"a": [1, 2]}, crs=_CRS, geometry=[city_hall_boundaries, vauquelin_place]
+)
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1]}, crs=_CRS, geometry=[MultiPolygon((
- city_hall_boundaries, vauquelin_place))])
+
+# ------------------
+# gdf with MultiPolygon
+gdf = GeoDataFrame(
+ {"a": [1]},
+ crs=_CRS,
+ geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))],
+)
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=[MultiPolygon((
- city_hall_boundaries, vauquelin_place)), city_hall_boundaries])
+
+# ------------------
+# gdf with Polygon and MultiPolygon
+gdf = GeoDataFrame(
+ {"a": [1, 2]},
+ crs=_CRS,
+ geometry=[
+ MultiPolygon((city_hall_boundaries, vauquelin_place)),
+ city_hall_boundaries,
+ ],
+)
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=[None, city_hall_entrance]
- )
+
+# ------------------
+# gdf with null geometry and Point
+gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, city_hall_entrance])
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=[None, point_3D])
+
+# ------------------
+# gdf with null geometry and 3D Point
+gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, point_3D])
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1, 2]}, crs=_CRS, geometry=[None, None])
+
+# ------------------
+# gdf with null geometries only
+gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, None])
_geodataframes_to_write.append(gdf)
-gdf = GeoDataFrame({'a': [1, 2, 3, 4, 5, 6]}, crs=_CRS, geometry=[
- MultiPolygon((city_hall_boundaries, vauquelin_place)),
- city_hall_entrance, MultiLineString(city_hall_walls), city_hall_walls[0
- ], MultiPoint([city_hall_entrance, city_hall_balcony]), city_hall_balcony])
+
+# ------------------
+# gdf with all shape types mixed together
+gdf = GeoDataFrame(
+ {"a": [1, 2, 3, 4, 5, 6]},
+ crs=_CRS,
+ geometry=[
+ MultiPolygon((city_hall_boundaries, vauquelin_place)),
+ city_hall_entrance,
+ MultiLineString(city_hall_walls),
+ city_hall_walls[0],
+ MultiPoint([city_hall_entrance, city_hall_balcony]),
+ city_hall_balcony,
+ ],
+)
_geodataframes_to_write.append(gdf)
-_expect_writing(gdf, 'ESRI Shapefile').to_raise(RuntimeError,
- 'Failed to write record')
-gdf = GeoDataFrame({'a': [1, 2, 3, 4, 5, 6, 7]}, crs=_CRS, geometry=[
- MultiPolygon((city_hall_boundaries, vauquelin_place)),
- city_hall_entrance, MultiLineString(city_hall_walls), city_hall_walls[0
- ], MultiPoint([city_hall_entrance, city_hall_balcony]),
- city_hall_balcony, point_3D])
+# Not supported by 'ESRI Shapefile' driver
+_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
+
+# ------------------
+# gdf with all 2D shape types and 3D Point mixed together
+gdf = GeoDataFrame(
+ {"a": [1, 2, 3, 4, 5, 6, 7]},
+ crs=_CRS,
+ geometry=[
+ MultiPolygon((city_hall_boundaries, vauquelin_place)),
+ city_hall_entrance,
+ MultiLineString(city_hall_walls),
+ city_hall_walls[0],
+ MultiPoint([city_hall_entrance, city_hall_balcony]),
+ city_hall_balcony,
+ point_3D,
+ ],
+)
_geodataframes_to_write.append(gdf)
-_expect_writing(gdf, 'ESRI Shapefile').to_raise(RuntimeError,
- 'Failed to write record')
+# Not supported by 'ESRI Shapefile' driver
+_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
+
+
+@pytest.fixture(params=_geodataframes_to_write)
+def geodataframe(request):
+ return request.param
+
+
+@pytest.fixture(
+ params=[
+ ("GeoJSON", ".geojson"),
+ ("ESRI Shapefile", ".shp"),
+ ("GPKG", ".gpkg"),
+ ("SQLite", ".sqlite"),
+ ]
+)
+def ogr_driver(request):
+ return request.param
+
+
+@pytest.fixture(
+ params=[
+ pytest.param("fiona", marks=FIONA_MARK),
+ pytest.param("pyogrio", marks=PYOGRIO_MARK),
+ ]
+)
+def engine(request):
+ return request.param
+
+
+def test_to_file_roundtrip(tmpdir, geodataframe, ogr_driver, engine):
+ driver, ext = ogr_driver
+ output_file = os.path.join(str(tmpdir), "output_file" + ext)
+ write_kwargs = {}
+ if driver == "SQLite":
+ write_kwargs["spatialite"] = True
+
+    # This if statement can be removed once the minimal fiona version is >= 1.8.20
+ if engine == "fiona":
+ from packaging.version import Version
+
+ import fiona
+
+ if Version(fiona.__version__) < Version("1.8.20"):
+ pytest.skip("SQLite driver only available from version 1.8.20")
+
+ # If only 3D Points, geometry_type needs to be specified for spatialite at the
+ # moment. This if can be removed once the following PR is released:
+ # https://github.com/geopandas/pyogrio/pull/223
+ if (
+ engine == "pyogrio"
+        and len(geodataframe) == 2
+ and geodataframe.geometry[0] is None
+ and geodataframe.geometry[1] is not None
+ and geodataframe.geometry[1].has_z
+ ):
+ write_kwargs["geometry_type"] = "Point Z"
+
+ expected_error = _expected_error_on(geodataframe, driver)
+ if expected_error:
+ with pytest.raises(
+ RuntimeError, match="Failed to write record|Could not add feature to layer"
+ ):
+ geodataframe.to_file(
+ output_file, driver=driver, engine=engine, **write_kwargs
+ )
+ else:
+ if driver == "SQLite" and engine == "pyogrio":
+ try:
+ geodataframe.to_file(
+ output_file, driver=driver, engine=engine, **write_kwargs
+ )
+ except ValueError as e:
+ if "unrecognized option 'SPATIALITE'" in str(e):
+ pytest.xfail(
+ "pyogrio wheels from PyPI do not come with SpatiaLite support. "
+ f"Error: {e}"
+ )
+ raise
+ else:
+ geodataframe.to_file(
+ output_file, driver=driver, engine=engine, **write_kwargs
+ )
+
+ reloaded = geopandas.read_file(output_file, engine=engine)
+
+ if driver == "GeoJSON" and engine == "pyogrio":
+ # For GeoJSON files, the int64 column comes back as int32
+ reloaded["a"] = reloaded["a"].astype("int64")
+
+ assert_geodataframe_equal(geodataframe, reloaded, check_column_type="equiv")
diff --git a/geopandas/io/tests/test_geoarrow.py b/geopandas/io/tests/test_geoarrow.py
index 1336e5bd..0087e6b0 100644
--- a/geopandas/io/tests/test_geoarrow.py
+++ b/geopandas/io/tests/test_geoarrow.py
@@ -3,14 +3,535 @@ import json
import os
import pathlib
from packaging.version import Version
+
import numpy as np
+
import shapely
from shapely import MultiPoint, Point, box
+
from geopandas import GeoDataFrame, GeoSeries
+
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
-pytest.importorskip('pyarrow')
+
+pytest.importorskip("pyarrow")
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow import feather
-DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / 'data'
+
+DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
+
+
+def pa_table(table):
+ if Version(pa.__version__) < Version("14.0.0"):
+ return table._pa_table
+ else:
+ return pa.table(table)
+
+
+def pa_array(array):
+ if Version(pa.__version__) < Version("14.0.0"):
+ return array._pa_array
+ else:
+ return pa.array(array)
+
+
+def assert_table_equal(left, right, check_metadata=True):
+ geom_type = left["geometry"].type
+ # in case of Points (directly the inner fixed_size_list or struct type)
+ # -> there are NaNs for empties -> we need to compare them separately
+ # and then fill, because pyarrow.Table.equals considers NaNs as not equal
+ if pa.types.is_fixed_size_list(geom_type):
+ left_values = left["geometry"].chunk(0).values
+ right_values = right["geometry"].chunk(0).values
+ assert pc.is_nan(left_values).equals(pc.is_nan(right_values))
+ left_geoms = pa.FixedSizeListArray.from_arrays(
+ pc.replace_with_mask(left_values, pc.is_nan(left_values), 0.0),
+ type=left["geometry"].type,
+ )
+ right_geoms = pa.FixedSizeListArray.from_arrays(
+ pc.replace_with_mask(right_values, pc.is_nan(right_values), 0.0),
+ type=right["geometry"].type,
+ )
+ left = left.set_column(1, left.schema.field("geometry"), left_geoms)
+ right = right.set_column(1, right.schema.field("geometry"), right_geoms)
+
+ elif pa.types.is_struct(geom_type):
+ left_arr = left["geometry"].chunk(0)
+ right_arr = right["geometry"].chunk(0)
+
+ for i in range(left_arr.type.num_fields):
+ assert pc.is_nan(left_arr.field(i)).equals(pc.is_nan(right_arr.field(i)))
+
+ left_geoms = pa.StructArray.from_arrays(
+ [
+ pc.replace_with_mask(
+ left_arr.field(i), pc.is_nan(left_arr.field(i)), 0.0
+ )
+ for i in range(left_arr.type.num_fields)
+ ],
+ fields=list(left["geometry"].type),
+ )
+ right_geoms = pa.StructArray.from_arrays(
+ [
+ pc.replace_with_mask(
+ right_arr.field(i), pc.is_nan(right_arr.field(i)), 0.0
+ )
+ for i in range(right_arr.type.num_fields)
+ ],
+ fields=list(right["geometry"].type),
+ )
+
+ left = left.set_column(1, left.schema.field("geometry"), left_geoms)
+ right = right.set_column(1, right.schema.field("geometry"), right_geoms)
+
+ if left.equals(right, check_metadata=check_metadata):
+ return
+
+ if not left.schema.equals(right.schema):
+ raise AssertionError(
+ "Schema not equal\nLeft:\n{0}\nRight:\n{1}".format(
+ left.schema, right.schema
+ )
+ )
+
+ if check_metadata:
+ if not left.schema.equals(right.schema, check_metadata=True):
+ if not left.schema.metadata == right.schema.metadata:
+ raise AssertionError(
+ "Metadata not equal\nLeft:\n{0}\nRight:\n{1}".format(
+ left.schema.metadata, right.schema.metadata
+ )
+ )
+ for col in left.schema.names:
+ assert left.schema.field(col).equals(
+ right.schema.field(col), check_metadata=True
+ )
+
+ for col in left.column_names:
+ a_left = pa.concat_arrays(left.column(col).chunks)
+ a_right = pa.concat_arrays(right.column(col).chunks)
+ if not a_left.equals(a_right):
+ raise AssertionError(
+ "Column '{0}' not equal:\n{1}".format(col, a_left.diff(a_right))
+ )
+
+ raise AssertionError("Tables not equal for unknown reason")
+
+
+@pytest.mark.skipif(
+ shapely.geos_version < (3, 9, 0),
+ reason="Checking for empty is buggy with GEOS<3.9",
+) # an old GEOS is installed in the CI builds with the defaults channel
+@pytest.mark.parametrize(
+ "dim",
+ [
+ "xy",
+ pytest.param(
+ "xyz",
+ marks=pytest.mark.skipif(
+ shapely.geos_version < (3, 10, 0),
+ reason="Cannot write 3D geometries with GEOS<3.10",
+ ),
+ ),
+ ],
+)
+@pytest.mark.parametrize(
+ "geometry_type",
+ ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
+)
+@pytest.mark.parametrize(
+ "geometry_encoding, interleaved",
+ [("WKB", None), ("geoarrow", True), ("geoarrow", False)],
+ ids=["WKB", "geoarrow-interleaved", "geoarrow-separated"],
+)
+def test_geoarrow_export(geometry_type, dim, geometry_encoding, interleaved):
+ base_path = DATA_PATH / "geoarrow"
+ suffix = geometry_type + ("_z" if dim == "xyz" else "")
+
+ # Read the example data
+ df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
+ df["geometry"] = GeoSeries.from_wkb(df["geometry"])
+ df["row_number"] = df["row_number"].astype("int32")
+ df = GeoDataFrame(df)
+ df.geometry.array.crs = None
+
+ # Read the expected data
+ if geometry_encoding == "WKB":
+ filename = f"example-{suffix}-wkb.arrow"
+ else:
+ filename = f"example-{suffix}{'-interleaved' if interleaved else ''}.arrow"
+ expected = feather.read_table(base_path / filename)
+
+ # GeoDataFrame -> Arrow Table
+ result = pa_table(
+ df.to_arrow(geometry_encoding=geometry_encoding, interleaved=interleaved)
+ )
+ # remove the "pandas" metadata
+ result = result.replace_schema_metadata(None)
+
+ mask_nonempty = None
+ if (
+ geometry_encoding == "WKB"
+ and dim == "xyz"
+ and geometry_type.startswith("multi")
+ ):
+ # for collections with z dimension, drop the empties because those don't
+ # roundtrip correctly to WKB
+ # (https://github.com/libgeos/geos/issues/888)
+ mask_nonempty = pa.array(np.asarray(~df.geometry.is_empty))
+ result = result.filter(mask_nonempty)
+ expected = expected.filter(mask_nonempty)
+
+ assert_table_equal(result, expected)
+
+ # GeoSeries -> Arrow array
+ if geometry_encoding != "WKB" and geometry_type == "point":
+        # for points, we again have to handle NaNs separately; we already did that
+        # for the table above, so just skip this part
+ return
+ result_arr = pa_array(
+ df.geometry.to_arrow(
+ geometry_encoding=geometry_encoding, interleaved=interleaved
+ )
+ )
+ if mask_nonempty is not None:
+ result_arr = result_arr.filter(mask_nonempty)
+ assert result_arr.equals(expected["geometry"].chunk(0))
+
+
+@pytest.mark.skipif(
+ Version(shapely.__version__) < Version("2.0.2"),
+ reason="from_ragged_array failing with read-only array input",
+)
+@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
+def test_geoarrow_multiple_geometry_crs(encoding):
+ pytest.importorskip("pyproj")
+ # ensure each geometry column has its own crs
+ gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
+ gdf["geom2"] = gdf.geometry.to_crs("epsg:3857")
+
+ result = pa_table(gdf.to_arrow(geometry_encoding=encoding))
+ meta1 = json.loads(
+ result.schema.field("geometry").metadata[b"ARROW:extension:metadata"]
+ )
+ assert json.loads(meta1["crs"])["id"]["code"] == 4326
+ meta2 = json.loads(
+ result.schema.field("geom2").metadata[b"ARROW:extension:metadata"]
+ )
+ assert json.loads(meta2["crs"])["id"]["code"] == 3857
+
+ roundtripped = GeoDataFrame.from_arrow(result)
+ assert_geodataframe_equal(gdf, roundtripped)
+ assert gdf.geometry.crs == "epsg:4326"
+ assert gdf.geom2.crs == "epsg:3857"
+
+
+@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
+def test_geoarrow_series_name_crs(encoding):
+ pytest.importorskip("pyproj")
+ pytest.importorskip("pyarrow", minversion="14.0.0")
+
+ gser = GeoSeries([box(0, 0, 10, 10)], crs="epsg:4326", name="geom")
+ schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
+ field = pa.Field._import_from_c_capsule(schema_capsule)
+ assert field.name == "geom"
+    assert field.metadata[b"ARROW:extension:name"] == (
+        b"geoarrow.wkb" if encoding == "WKB" else b"geoarrow.polygon"
+    )
+ meta = json.loads(field.metadata[b"ARROW:extension:metadata"])
+ assert json.loads(meta["crs"])["id"]["code"] == 4326
+
+ # ensure it also works without a name
+ gser = GeoSeries([box(0, 0, 10, 10)])
+ schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
+ field = pa.Field._import_from_c_capsule(schema_capsule)
+ assert field.name == ""
+
+
+def test_geoarrow_unsupported_encoding():
+ gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
+
+ with pytest.raises(ValueError, match="Expected geometry encoding"):
+ gdf.to_arrow(geometry_encoding="invalid")
+
+ with pytest.raises(ValueError, match="Expected geometry encoding"):
+ gdf.geometry.to_arrow(geometry_encoding="invalid")
+
+
+def test_geoarrow_mixed_geometry_types():
+ gdf = GeoDataFrame(
+ {"geometry": [Point(0, 0), box(0, 0, 10, 10)]},
+ crs="epsg:4326",
+ )
+
+ with pytest.raises(ValueError, match="Geometry type combination is not supported"):
+ gdf.to_arrow(geometry_encoding="geoarrow")
+
+ gdf = GeoDataFrame(
+ {"geometry": [Point(0, 0), MultiPoint([(0, 0), (1, 1)])]},
+ crs="epsg:4326",
+ )
+ result = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
+ assert (
+ result.schema.field("geometry").metadata[b"ARROW:extension:name"]
+ == b"geoarrow.multipoint"
+ )
+
+
+@pytest.mark.parametrize("geom_type", ["point", "polygon"])
+@pytest.mark.parametrize(
+ "encoding, interleaved", [("WKB", True), ("geoarrow", True), ("geoarrow", False)]
+)
+def test_geoarrow_missing(encoding, interleaved, geom_type):
+ # dummy test for single geometry type until missing values are included
+ # in the test data for test_geoarrow_export
+ gdf = GeoDataFrame(
+ geometry=[Point(0, 0) if geom_type == "point" else box(0, 0, 10, 10), None],
+ crs="epsg:4326",
+ )
+ if (
+ encoding == "geoarrow"
+ and geom_type == "point"
+ and interleaved
+ and Version(pa.__version__) < Version("15.0.0")
+ ):
+ with pytest.raises(
+ ValueError,
+ match="Converting point geometries with missing values is not supported",
+ ):
+ gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved)
+ return
+ result = pa_table(gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved))
+ assert result["geometry"].null_count == 1
+ assert result["geometry"].is_null().to_pylist() == [False, True]
+
+
+def test_geoarrow_include_z():
+ gdf = GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1), Point()]})
+
+ table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
+ assert table["geometry"].type.value_field.name == "xy"
+ assert table["geometry"].type.list_size == 2
+
+ table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow", include_z=True))
+ assert table["geometry"].type.value_field.name == "xyz"
+ assert table["geometry"].type.list_size == 3
+ assert np.isnan(table["geometry"].chunk(0).values.to_numpy()[2::3]).all()
+
+ gdf = GeoDataFrame({"geometry": [Point(0, 0, 0), Point(1, 1, 1), Point()]})
+
+ table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
+ assert table["geometry"].type.value_field.name == "xyz"
+ assert table["geometry"].type.list_size == 3
+
+ table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow", include_z=False))
+ assert table["geometry"].type.value_field.name == "xy"
+ assert table["geometry"].type.list_size == 2
+
+
+@contextlib.contextmanager
+def with_geoarrow_extension_types():
+ gp = pytest.importorskip("geoarrow.pyarrow")
+ gp.register_extension_types()
+ try:
+ yield
+ finally:
+ gp.unregister_extension_types()
+
+
+@pytest.mark.parametrize("dim", ["xy", "xyz"])
+@pytest.mark.parametrize(
+ "geometry_type",
+ ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
+)
+def test_geoarrow_export_with_extension_types(geometry_type, dim):
+ # ensure the exported data can be imported by geoarrow-pyarrow and are
+ # recognized as extension types
+ base_path = DATA_PATH / "geoarrow"
+ suffix = geometry_type + ("_z" if dim == "xyz" else "")
+
+ # Read the example data
+ df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
+ df["geometry"] = GeoSeries.from_wkb(df["geometry"])
+ df["row_number"] = df["row_number"].astype("int32")
+ df = GeoDataFrame(df)
+ df.geometry.array.crs = None
+
+ pytest.importorskip("geoarrow.pyarrow")
+
+ with with_geoarrow_extension_types():
+ result1 = pa_table(df.to_arrow(geometry_encoding="WKB"))
+ assert isinstance(result1["geometry"].type, pa.ExtensionType)
+
+ result2 = pa_table(df.to_arrow(geometry_encoding="geoarrow"))
+ assert isinstance(result2["geometry"].type, pa.ExtensionType)
+
+ result3 = pa_table(df.to_arrow(geometry_encoding="geoarrow", interleaved=False))
+ assert isinstance(result3["geometry"].type, pa.ExtensionType)
+
+
+@pytest.mark.skipif(
+ Version(shapely.__version__) < Version("2.0.2"),
+ reason="from_ragged_array failing with read-only array input",
+)
+@pytest.mark.parametrize("dim", ["xy", "xyz"])
+@pytest.mark.parametrize(
+ "geometry_type",
+ [
+ "point",
+ "linestring",
+ "polygon",
+ "multipoint",
+ "multilinestring",
+ "multipolygon",
+ ],
+)
+def test_geoarrow_import(geometry_type, dim):
+ base_path = DATA_PATH / "geoarrow"
+ suffix = geometry_type + ("_z" if dim == "xyz" else "")
+
+ # Read the example data
+ df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
+ df["geometry"] = GeoSeries.from_wkb(df["geometry"])
+ df = GeoDataFrame(df)
+ df.geometry.crs = None
+
+ table1 = feather.read_table(base_path / f"example-{suffix}-wkb.arrow")
+ result1 = GeoDataFrame.from_arrow(table1)
+ assert_geodataframe_equal(result1, df)
+
+ table2 = feather.read_table(base_path / f"example-{suffix}-interleaved.arrow")
+ result2 = GeoDataFrame.from_arrow(table2)
+ assert_geodataframe_equal(result2, df)
+
+ table3 = feather.read_table(base_path / f"example-{suffix}.arrow")
+ result3 = GeoDataFrame.from_arrow(table3)
+ assert_geodataframe_equal(result3, df)
+
+
+@pytest.mark.skipif(
+ Version(shapely.__version__) < Version("2.0.2"),
+ reason="from_ragged_array failing with read-only array input",
+)
+@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
+def test_geoarrow_import_geometry_column(encoding):
+ pytest.importorskip("pyproj")
+    # ensure the active geometry column is picked up (or can be overridden) on import
+ gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)])
+ gdf["centroid"] = gdf.geometry.centroid
+
+ result = GeoDataFrame.from_arrow(pa_table(gdf.to_arrow(geometry_encoding=encoding)))
+ assert_geodataframe_equal(result, gdf)
+ assert result.active_geometry_name == "geometry"
+
+ result = GeoDataFrame.from_arrow(
+ pa_table(gdf[["centroid"]].to_arrow(geometry_encoding=encoding))
+ )
+ assert result.active_geometry_name == "centroid"
+
+ result = GeoDataFrame.from_arrow(
+ pa_table(gdf.to_arrow(geometry_encoding=encoding)), geometry="centroid"
+ )
+ assert result.active_geometry_name == "centroid"
+ assert_geodataframe_equal(result, gdf.set_geometry("centroid"))
+
+
+def test_geoarrow_import_missing_geometry():
+ pytest.importorskip("pyarrow", minversion="14.0.0")
+
+ table = pa.table({"a": [0, 1, 2], "b": [0.1, 0.2, 0.3]})
+ with pytest.raises(ValueError, match="No geometry column found"):
+ GeoDataFrame.from_arrow(table)
+
+ with pytest.raises(ValueError, match="No GeoArrow geometry field found"):
+ GeoSeries.from_arrow(table["a"].chunk(0))
+
+
+def test_geoarrow_import_capsule_interface():
+    # ensure we can import a non-pyarrow object
+ pytest.importorskip("pyarrow", minversion="14.0.0")
+ gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
+
+ result = GeoDataFrame.from_arrow(gdf.to_arrow())
+ assert_geodataframe_equal(result, gdf)
+
+
+@pytest.mark.parametrize("dim", ["xy", "xyz"])
+@pytest.mark.parametrize(
+ "geometry_type",
+ ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
+)
+def test_geoarrow_import_from_extension_types(geometry_type, dim):
+    # ensure data exported as geoarrow-pyarrow extension types can be
+    # imported back into a GeoDataFrame
+ pytest.importorskip("pyproj")
+ base_path = DATA_PATH / "geoarrow"
+ suffix = geometry_type + ("_z" if dim == "xyz" else "")
+
+ # Read the example data
+ df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
+ df["geometry"] = GeoSeries.from_wkb(df["geometry"])
+ df = GeoDataFrame(df, crs="EPSG:3857")
+
+ pytest.importorskip("geoarrow.pyarrow")
+
+ with with_geoarrow_extension_types():
+ result1 = GeoDataFrame.from_arrow(
+ pa_table(df.to_arrow(geometry_encoding="WKB"))
+ )
+ assert_geodataframe_equal(result1, df)
+
+ result2 = GeoDataFrame.from_arrow(
+ pa_table(df.to_arrow(geometry_encoding="geoarrow"))
+ )
+ assert_geodataframe_equal(result2, df)
+
+ result3 = GeoDataFrame.from_arrow(
+ pa_table(df.to_arrow(geometry_encoding="geoarrow", interleaved=False))
+ )
+ assert_geodataframe_equal(result3, df)
+
+
+def test_geoarrow_import_geoseries():
+ pytest.importorskip("pyproj")
+ gp = pytest.importorskip("geoarrow.pyarrow")
+ ser = GeoSeries.from_wkt(["POINT (1 1)", "POINT (2 2)"], crs="EPSG:3857")
+
+ with with_geoarrow_extension_types():
+ arr = gp.array(ser.to_arrow(geometry_encoding="WKB"))
+ result = GeoSeries.from_arrow(arr)
+ assert_geoseries_equal(result, ser)
+
+ arr = gp.array(ser.to_arrow(geometry_encoding="geoarrow"))
+ result = GeoSeries.from_arrow(arr)
+ assert_geoseries_equal(result, ser)
+
+ # the name is lost when going through a pyarrow.Array
+ ser.name = "name"
+ arr = gp.array(ser.to_arrow())
+ result = GeoSeries.from_arrow(arr)
+ assert result.name is None
+ # we can specify the name as one of the kwargs
+ result = GeoSeries.from_arrow(arr, name="test")
+ assert_geoseries_equal(result, ser)
+
+
+def test_geoarrow_import_unknown_geoarrow_type():
+ gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
+ table = pa_table(gdf.to_arrow())
+ schema = table.schema
+ new_field = schema.field("geometry").with_metadata(
+ {
+ b"ARROW:extension:name": b"geoarrow.unknown",
+ b"ARROW:extension:metadata": b"{}",
+ }
+ )
+
+ new_schema = pa.schema([schema.field(0), new_field])
+ new_table = table.cast(new_schema)
+
+ with pytest.raises(TypeError, match="Unknown GeoArrow extension type"):
+ GeoDataFrame.from_arrow(new_table)
diff --git a/geopandas/io/tests/test_infer_schema.py b/geopandas/io/tests/test_infer_schema.py
index 014ddf7d..61a72171 100644
--- a/geopandas/io/tests/test_infer_schema.py
+++ b/geopandas/io/tests/test_infer_schema.py
@@ -1,31 +1,306 @@
from collections import OrderedDict
+
import numpy as np
import pandas as pd
-from shapely.geometry import LineString, MultiLineString, MultiPoint, MultiPolygon, Point, Polygon
+
+from shapely.geometry import (
+ LineString,
+ MultiLineString,
+ MultiPoint,
+ MultiPolygon,
+ Point,
+ Polygon,
+)
+
from geopandas import GeoDataFrame
from geopandas.io.file import infer_schema
+
import pytest
-city_hall_boundaries = Polygon(((-73.5541107525234, 45.5091983609661), (-
- 73.5546126200639, 45.5086813829106), (-73.5540185061397,
- 45.5084409343852), (-73.5539986525799, 45.5084323044531), (-
- 73.5535801792994, 45.5089539203786), (-73.5541107525234, 45.5091983609661))
- )
-vauquelin_place = Polygon(((-73.5542465586147, 45.5081555487952), (-
- 73.5540185061397, 45.5084409343852), (-73.5546126200639,
- 45.5086813829106), (-73.5548825850032, 45.5084033554357), (-
- 73.5542465586147, 45.5081555487952)))
-city_hall_walls = [LineString(((-73.5541107525234, 45.5091983609661), (-
- 73.5546126200639, 45.5086813829106), (-73.5540185061397,
- 45.5084409343852))), LineString(((-73.5539986525799, 45.5084323044531),
- (-73.5535801792994, 45.5089539203786), (-73.5541107525234,
- 45.5091983609661)))]
+
+# Credit: Polygons below come from Montreal city Open Data portal
+# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
+city_hall_boundaries = Polygon(
+ (
+ (-73.5541107525234, 45.5091983609661),
+ (-73.5546126200639, 45.5086813829106),
+ (-73.5540185061397, 45.5084409343852),
+ (-73.5539986525799, 45.5084323044531),
+ (-73.5535801792994, 45.5089539203786),
+ (-73.5541107525234, 45.5091983609661),
+ )
+)
+vauquelin_place = Polygon(
+ (
+ (-73.5542465586147, 45.5081555487952),
+ (-73.5540185061397, 45.5084409343852),
+ (-73.5546126200639, 45.5086813829106),
+ (-73.5548825850032, 45.5084033554357),
+ (-73.5542465586147, 45.5081555487952),
+ )
+)
+
+city_hall_walls = [
+ LineString(
+ (
+ (-73.5541107525234, 45.5091983609661),
+ (-73.5546126200639, 45.5086813829106),
+ (-73.5540185061397, 45.5084409343852),
+ )
+ ),
+ LineString(
+ (
+ (-73.5539986525799, 45.5084323044531),
+ (-73.5535801792994, 45.5089539203786),
+ (-73.5541107525234, 45.5091983609661),
+ )
+ ),
+]
+
city_hall_entrance = Point(-73.553785, 45.508722)
-city_hall_balcony = Point(-73.554138, 45.50908)
+city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)
+
point_3D = Point(-73.553785, 45.508722, 300)
-linestring_3D = LineString(((-73.5541107525234, 45.5091983609661, 300), (-
- 73.5546126200639, 45.5086813829106, 300), (-73.5540185061397,
- 45.5084409343852, 300)))
-polygon_3D = Polygon(((-73.5541107525234, 45.5091983609661, 300), (-
- 73.5535801792994, 45.5089539203786, 300), (-73.5541107525234,
- 45.5091983609661, 300)))
+linestring_3D = LineString(
+ (
+ (-73.5541107525234, 45.5091983609661, 300),
+ (-73.5546126200639, 45.5086813829106, 300),
+ (-73.5540185061397, 45.5084409343852, 300),
+ )
+)
+polygon_3D = Polygon(
+ (
+ (-73.5541107525234, 45.5091983609661, 300),
+ (-73.5535801792994, 45.5089539203786, 300),
+ (-73.5541107525234, 45.5091983609661, 300),
+ )
+)
+
+
+def test_infer_schema_only_points():
+ df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
+
+ assert infer_schema(df) == {"geometry": "Point", "properties": OrderedDict()}
+
+
+def test_infer_schema_points_and_multipoints():
+ df = GeoDataFrame(
+ geometry=[
+ MultiPoint([city_hall_entrance, city_hall_balcony]),
+ city_hall_balcony,
+ ]
+ )
+
+ assert infer_schema(df) == {
+ "geometry": ["MultiPoint", "Point"],
+ "properties": OrderedDict(),
+ }
+
+
+def test_infer_schema_only_multipoints():
+ df = GeoDataFrame(
+ geometry=[
+ MultiPoint(
+ [city_hall_entrance, city_hall_balcony, city_hall_council_chamber]
+ )
+ ]
+ )
+
+ assert infer_schema(df) == {"geometry": "MultiPoint", "properties": OrderedDict()}
+
+
+def test_infer_schema_only_linestrings():
+ df = GeoDataFrame(geometry=city_hall_walls)
+
+ assert infer_schema(df) == {"geometry": "LineString", "properties": OrderedDict()}
+
+
+def test_infer_schema_linestrings_and_multilinestrings():
+ df = GeoDataFrame(geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]])
+
+ assert infer_schema(df) == {
+ "geometry": ["MultiLineString", "LineString"],
+ "properties": OrderedDict(),
+ }
+
+
+def test_infer_schema_only_multilinestrings():
+ df = GeoDataFrame(geometry=[MultiLineString(city_hall_walls)])
+
+ assert infer_schema(df) == {
+ "geometry": "MultiLineString",
+ "properties": OrderedDict(),
+ }
+
+
+def test_infer_schema_only_polygons():
+ df = GeoDataFrame(geometry=[city_hall_boundaries, vauquelin_place])
+
+ assert infer_schema(df) == {"geometry": "Polygon", "properties": OrderedDict()}
+
+
+def test_infer_schema_polygons_and_multipolygons():
+ df = GeoDataFrame(
+ geometry=[
+ MultiPolygon((city_hall_boundaries, vauquelin_place)),
+ city_hall_boundaries,
+ ]
+ )
+
+ assert infer_schema(df) == {
+ "geometry": ["MultiPolygon", "Polygon"],
+ "properties": OrderedDict(),
+ }
+
+
+def test_infer_schema_only_multipolygons():
+ df = GeoDataFrame(geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))])
+
+ assert infer_schema(df) == {"geometry": "MultiPolygon", "properties": OrderedDict()}
+
+
+def test_infer_schema_multiple_shape_types():
+ df = GeoDataFrame(
+ geometry=[
+ MultiPolygon((city_hall_boundaries, vauquelin_place)),
+ city_hall_boundaries,
+ MultiLineString(city_hall_walls),
+ city_hall_walls[0],
+ MultiPoint([city_hall_entrance, city_hall_balcony]),
+ city_hall_balcony,
+ ]
+ )
+
+ assert infer_schema(df) == {
+ "geometry": [
+ "MultiPolygon",
+ "Polygon",
+ "MultiLineString",
+ "LineString",
+ "MultiPoint",
+ "Point",
+ ],
+ "properties": OrderedDict(),
+ }
+
+
+def test_infer_schema_mixed_3D_shape_type():
+ df = GeoDataFrame(
+ geometry=[
+ MultiPolygon((city_hall_boundaries, vauquelin_place)),
+ city_hall_boundaries,
+ MultiLineString(city_hall_walls),
+ city_hall_walls[0],
+ MultiPoint([city_hall_entrance, city_hall_balcony]),
+ city_hall_balcony,
+ point_3D,
+ ]
+ )
+
+ assert infer_schema(df) == {
+ "geometry": [
+ "3D Point",
+ "MultiPolygon",
+ "Polygon",
+ "MultiLineString",
+ "LineString",
+ "MultiPoint",
+ "Point",
+ ],
+ "properties": OrderedDict(),
+ }
+
+
+def test_infer_schema_mixed_3D_Point():
+ df = GeoDataFrame(geometry=[city_hall_balcony, point_3D])
+
+ assert infer_schema(df) == {
+ "geometry": ["3D Point", "Point"],
+ "properties": OrderedDict(),
+ }
+
+
+def test_infer_schema_only_3D_Points():
+ df = GeoDataFrame(geometry=[point_3D, point_3D])
+
+ assert infer_schema(df) == {"geometry": "3D Point", "properties": OrderedDict()}
+
+
+def test_infer_schema_mixed_3D_linestring():
+ df = GeoDataFrame(geometry=[city_hall_walls[0], linestring_3D])
+
+ assert infer_schema(df) == {
+ "geometry": ["3D LineString", "LineString"],
+ "properties": OrderedDict(),
+ }
+
+
+def test_infer_schema_only_3D_linestrings():
+ df = GeoDataFrame(geometry=[linestring_3D, linestring_3D])
+
+ assert infer_schema(df) == {
+ "geometry": "3D LineString",
+ "properties": OrderedDict(),
+ }
+
+
+def test_infer_schema_mixed_3D_Polygon():
+ df = GeoDataFrame(geometry=[city_hall_boundaries, polygon_3D])
+
+ assert infer_schema(df) == {
+ "geometry": ["3D Polygon", "Polygon"],
+ "properties": OrderedDict(),
+ }
+
+
+def test_infer_schema_only_3D_Polygons():
+ df = GeoDataFrame(geometry=[polygon_3D, polygon_3D])
+
+ assert infer_schema(df) == {"geometry": "3D Polygon", "properties": OrderedDict()}
+
+
+def test_infer_schema_null_geometry_and_2D_point():
+ df = GeoDataFrame(geometry=[None, city_hall_entrance])
+
+ # None geometry type is then omitted
+ assert infer_schema(df) == {"geometry": "Point", "properties": OrderedDict()}
+
+
+def test_infer_schema_null_geometry_and_3D_point():
+ df = GeoDataFrame(geometry=[None, point_3D])
+
+ # None geometry type is then omitted
+ assert infer_schema(df) == {"geometry": "3D Point", "properties": OrderedDict()}
+
+
+def test_infer_schema_null_geometry_all():
+ df = GeoDataFrame(geometry=[None, None])
+
+ # None geometry type in then replaced by 'Unknown'
+ # (default geometry type supported by Fiona)
+ assert infer_schema(df) == {"geometry": "Unknown", "properties": OrderedDict()}
+
+
+@pytest.mark.parametrize(
+ "array_data,dtype", [([1, 2**31 - 1], np.int32), ([1, np.nan], pd.Int32Dtype())]
+)
+def test_infer_schema_int32(array_data, dtype):
+ int32col = pd.array(data=array_data, dtype=dtype)
+ df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
+ df["int32_column"] = int32col
+
+ assert infer_schema(df) == {
+ "geometry": "Point",
+ "properties": OrderedDict([("int32_column", "int32")]),
+ }
+
+
+def test_infer_schema_int64():
+ int64col = pd.array([1, np.nan], dtype=pd.Int64Dtype())
+ df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
+ df["int64_column"] = int64col
+
+ assert infer_schema(df) == {
+ "geometry": "Point",
+ "properties": OrderedDict([("int64_column", "int")]),
+ }
diff --git a/geopandas/io/tests/test_pickle.py b/geopandas/io/tests/test_pickle.py
index 03867e3a..6d962807 100644
--- a/geopandas/io/tests/test_pickle.py
+++ b/geopandas/io/tests/test_pickle.py
@@ -2,11 +2,55 @@
See generate_legacy_storage_files.py for the creation of the legacy files.
"""
+
import glob
import os
import pathlib
+
import pandas as pd
+
import pytest
from geopandas.testing import assert_geodataframe_equal
-DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / 'data'
-files = glob.glob(str(DATA_PATH / 'pickle' / '*.pickle'))
+
+DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
+
+
+@pytest.fixture(scope="module")
+def current_pickle_data():
+ # our current version pickle data
+ from .generate_legacy_storage_files import create_pickle_data
+
+ return create_pickle_data()
+
+
+files = glob.glob(str(DATA_PATH / "pickle" / "*.pickle"))
+
+
+@pytest.fixture(params=files, ids=[p.split("/")[-1] for p in files])
+def legacy_pickle(request):
+ return request.param
+
+
+@pytest.mark.skip(
+ reason=(
+ "shapely 2.0/pygeos-based unpickling currently only works for "
+ "shapely-2.0/pygeos-written files"
+ ),
+)
+def test_legacy_pickles(current_pickle_data, legacy_pickle):
+ result = pd.read_pickle(legacy_pickle)
+
+ for name, value in result.items():
+ expected = current_pickle_data[name]
+ assert_geodataframe_equal(value, expected)
+
+
+def test_round_trip_current(tmpdir, current_pickle_data):
+ data = current_pickle_data
+
+ for name, value in data.items():
+ path = str(tmpdir / "{}.pickle".format(name))
+ value.to_pickle(path)
+ result = pd.read_pickle(path)
+ assert_geodataframe_equal(result, value)
+ assert isinstance(result.has_sindex, bool)
diff --git a/geopandas/io/tests/test_sql.py b/geopandas/io/tests/test_sql.py
index 00f0209c..d394098f 100644
--- a/geopandas/io/tests/test_sql.py
+++ b/geopandas/io/tests/test_sql.py
@@ -4,38 +4,67 @@ The spatial database tests may not work without additional system
configuration. postGIS tests require a test database to have been setup;
see geopandas.tests.util for more information.
"""
+
import os
import warnings
from importlib.util import find_spec
+
import pandas as pd
+
import geopandas
import geopandas._compat as compat
from geopandas import GeoDataFrame, read_file, read_postgis
from geopandas._compat import HAS_PYPROJ
from geopandas.io.sql import _get_conn as get_conn
from geopandas.io.sql import _write_postgis as write_postgis
+
import pytest
-from geopandas.tests.util import create_postgis, create_spatialite, mock, validate_boro_df
+from geopandas.tests.util import (
+ create_postgis,
+ create_spatialite,
+ mock,
+ validate_boro_df,
+)
+
try:
from sqlalchemy import text
except ImportError:
+ # Avoid local imports for text in all sqlalchemy tests
+ # all tests using text use engine_postgis, which ensures sqlalchemy is available
text = str
-def check_available_postgis_drivers() ->list[str]:
+@pytest.fixture
+def df_nybb(nybb_filename):
+ df = read_file(nybb_filename)
+ return df
+
+
+def check_available_postgis_drivers() -> list[str]:
"""Work out which of psycopg2 and psycopg are available.
This prevents tests running if the relevant package isn't installed
(rather than being skipped, as skips are treated as failures during postgis CI)
"""
- pass
+ drivers = []
+ if find_spec("psycopg"):
+ drivers.append("psycopg")
+ if find_spec("psycopg2"):
+ drivers.append("psycopg2")
+ return drivers
POSTGIS_DRIVERS = check_available_postgis_drivers()
-def prepare_database_credentials() ->dict:
+def prepare_database_credentials() -> dict:
"""Gather postgres connection credentials from environment variables."""
- pass
+ return {
+ "dbname": "test_geopandas",
+ "user": os.environ.get("PGUSER"),
+ "password": os.environ.get("PGPASSWORD"),
+ "host": os.environ.get("PGHOST"),
+ "port": os.environ.get("PGPORT"),
+ }
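+
+
+# A minimal sketch of how these credentials resolve (hypothetical values): with
+# PGUSER=postgres, PGHOST=localhost and PGPORT=5432 exported in the environment,
+# prepare_database_credentials() returns
+#   {"dbname": "test_geopandas", "user": "postgres", "password": None,
+#    "host": "localhost", "port": "5432"}
+# which the fixtures below pass to psycopg/psycopg2 or to a SQLAlchemy engine URL.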
@pytest.fixture()
@@ -43,7 +72,18 @@ def connection_postgis(request):
"""Create a postgres connection using either psycopg2 or psycopg.
Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS."""
- pass
+ psycopg = pytest.importorskip(request.param)
+
+ try:
+ con = psycopg.connect(**prepare_database_credentials())
+ except psycopg.OperationalError:
+ pytest.skip("Cannot connect with postgresql database")
+ with warnings.catch_warnings():
+ warnings.filterwarnings(
+ "ignore", message="pandas only supports SQLAlchemy connectable.*"
+ )
+ yield con
+ con.close()
@pytest.fixture()
@@ -53,7 +93,27 @@ def engine_postgis(request):
Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS.
"""
- pass
+ sqlalchemy = pytest.importorskip("sqlalchemy")
+ from sqlalchemy.engine.url import URL
+
+ credentials = prepare_database_credentials()
+ try:
+ con = sqlalchemy.create_engine(
+ URL.create(
+ drivername=f"postgresql+{request.param}",
+ username=credentials["user"],
+ database=credentials["dbname"],
+ password=credentials["password"],
+ host=credentials["host"],
+ port=credentials["port"],
+ )
+ )
+ con.connect()
+ except Exception:
+ pytest.skip("Cannot connect with postgresql database")
+
+ yield con
+ con.dispose()
@pytest.fixture()
@@ -72,186 +132,747 @@ def connection_spatialite():
``AttributeError`` on missing support for loadable SQLite extensions
``sqlite3.OperationalError`` on missing SpatiaLite
"""
- pass
+ sqlite3 = pytest.importorskip("sqlite3")
+ try:
+ with sqlite3.connect(":memory:") as con:
+ con.enable_load_extension(True)
+ con.load_extension("mod_spatialite")
+ con.execute("SELECT InitSpatialMetaData(TRUE)")
+ except Exception:
+ con.close()
+ pytest.skip("Cannot setup spatialite database")
+
+ yield con
+ con.close()
+
+
+def drop_table_if_exists(conn_or_engine, table):
+ sqlalchemy = pytest.importorskip("sqlalchemy")
+
+ if sqlalchemy.inspect(conn_or_engine).has_table(table):
+ metadata = sqlalchemy.MetaData()
+ with warnings.catch_warnings():
+ warnings.filterwarnings(
+ "ignore", message="Did not recognize type 'geometry' of column.*"
+ )
+ metadata.reflect(conn_or_engine)
+ table = metadata.tables.get(table)
+ if table is not None:
+ table.drop(conn_or_engine, checkfirst=True)
+
+
+@pytest.fixture
+def df_mixed_single_and_multi():
+ from shapely.geometry import LineString, MultiLineString, Point
+
+ df = geopandas.GeoDataFrame(
+ {
+ "geometry": [
+ LineString([(0, 0), (1, 1)]),
+ MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
+ Point(0, 1),
+ ]
+ },
+ crs="epsg:4326",
+ )
+ return df
+
+
+@pytest.fixture
+def df_geom_collection():
+ from shapely.geometry import GeometryCollection, LineString, Point, Polygon
+
+ df = geopandas.GeoDataFrame(
+ {
+ "geometry": [
+ GeometryCollection(
+ [
+ Polygon([(0, 0), (1, 1), (0, 1)]),
+ LineString([(0, 0), (1, 1)]),
+ Point(0, 0),
+ ]
+ )
+ ]
+ },
+ crs="epsg:4326",
+ )
+ return df
+
+
+@pytest.fixture
+def df_linear_ring():
+ from shapely.geometry import LinearRing
+
+ df = geopandas.GeoDataFrame(
+ {"geometry": [LinearRing(((0, 0), (0, 1), (1, 1), (1, 0)))]}, crs="epsg:4326"
+ )
+ return df
+
+
+@pytest.fixture
+def df_3D_geoms():
+ from shapely.geometry import LineString, Point, Polygon
+
+ df = geopandas.GeoDataFrame(
+ {
+ "geometry": [
+ LineString([(0, 0, 0), (1, 1, 1)]),
+ Polygon([(0, 0, 0), (1, 1, 1), (0, 1, 1)]),
+ Point(0, 1, 2),
+ ]
+ },
+ crs="epsg:4326",
+ )
+ return df
class TestIO:
-
- @pytest.mark.parametrize('connection_postgis', POSTGIS_DRIVERS,
- indirect=True)
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_get_conn(self, engine_postgis):
+ Connection = pytest.importorskip("sqlalchemy.engine.base").Connection
+
+ engine = engine_postgis
+ with get_conn(engine) as output:
+ assert isinstance(output, Connection)
+ with engine.connect() as conn:
+ with get_conn(conn) as output:
+ assert isinstance(output, Connection)
+ with pytest.raises(ValueError):
+ with get_conn(object()):
+ pass
+
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_read_postgis_default(self, connection_postgis, df_nybb):
+ con = connection_postgis
+ create_postgis(con, df_nybb)
+
+ sql = "SELECT * FROM nybb;"
+ df = read_postgis(sql, con)
+
+ validate_boro_df(df)
+ # no crs defined on the created geodatabase, and none specified
+ # by user; should not be set to 0, as from get_srid failure
+ assert df.crs is None
+
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_read_postgis_custom_geom_col(self, connection_postgis, df_nybb):
+ con = connection_postgis
+ geom_col = "the_geom"
+ create_postgis(con, df_nybb, geom_col=geom_col)
+
+ sql = "SELECT * FROM nybb;"
+ df = read_postgis(sql, con, geom_col=geom_col)
+
+ validate_boro_df(df)
+
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_select_geom_as(self, connection_postgis, df_nybb):
"""Tests that a SELECT {geom} AS {some_other_geom} works."""
- pass
+ con = connection_postgis
+ orig_geom = "geom"
+ out_geom = "the_geom"
+ create_postgis(con, df_nybb, geom_col=orig_geom)
+
+ sql = """SELECT borocode, boroname, shape_leng, shape_area,
+ {} as {} FROM nybb;""".format(
+ orig_geom, out_geom
+ )
+ df = read_postgis(sql, con, geom_col=out_geom)
+
+ validate_boro_df(df)
- @pytest.mark.parametrize('connection_postgis', POSTGIS_DRIVERS,
- indirect=True)
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_get_srid(self, connection_postgis, df_nybb):
"""Tests that an SRID can be read from a geodatabase (GH #451)."""
- pass
+ con = connection_postgis
+ crs = "epsg:4269"
+ df_reproj = df_nybb.to_crs(crs)
+ create_postgis(con, df_reproj, srid=4269)
+
+ sql = "SELECT * FROM nybb;"
+ df = read_postgis(sql, con)
- @pytest.mark.parametrize('connection_postgis', POSTGIS_DRIVERS,
- indirect=True)
+        validate_boro_df(df)
+        assert df.crs == crs
+
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_override_srid(self, connection_postgis, df_nybb):
"""Tests that a user specified CRS overrides the geodatabase SRID."""
- pass
+ con = connection_postgis
+ orig_crs = df_nybb.crs
+ create_postgis(con, df_nybb, srid=4269)
+
+ sql = "SELECT * FROM nybb;"
+ df = read_postgis(sql, con, crs=orig_crs)
+
+ validate_boro_df(df)
+ assert df.crs == orig_crs
+
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_from_postgis_default(self, connection_postgis, df_nybb):
+ con = connection_postgis
+ create_postgis(con, df_nybb)
+
+ sql = "SELECT * FROM nybb;"
+ df = GeoDataFrame.from_postgis(sql, con)
+
+ validate_boro_df(df, case_sensitive=False)
+
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_from_postgis_custom_geom_col(self, connection_postgis, df_nybb):
+ con = connection_postgis
+ geom_col = "the_geom"
+ create_postgis(con, df_nybb, geom_col=geom_col)
+
+ sql = "SELECT * FROM nybb;"
+ df = GeoDataFrame.from_postgis(sql, con, geom_col=geom_col)
+
+ validate_boro_df(df, case_sensitive=False)
def test_read_postgis_null_geom(self, connection_spatialite, df_nybb):
"""Tests that geometry with NULL is accepted."""
- pass
+ con = connection_spatialite
+ geom_col = df_nybb.geometry.name
+ df_nybb.geometry.iat[0] = None
+ create_spatialite(con, df_nybb)
+ sql = (
+ "SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
+ 'AsEWKB("{0}") AS "{0}" FROM nybb'.format(geom_col)
+ )
+ df = read_postgis(sql, con, geom_col=geom_col)
+ validate_boro_df(df)
def test_read_postgis_binary(self, connection_spatialite, df_nybb):
"""Tests that geometry read as binary is accepted."""
- pass
+ con = connection_spatialite
+ geom_col = df_nybb.geometry.name
+ create_spatialite(con, df_nybb)
+ sql = (
+ "SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
+ 'ST_AsBinary("{0}") AS "{0}" FROM nybb'.format(geom_col)
+ )
+ df = read_postgis(sql, con, geom_col=geom_col)
+ validate_boro_df(df)
- @pytest.mark.parametrize('connection_postgis', POSTGIS_DRIVERS,
- indirect=True)
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_chunksize(self, connection_postgis, df_nybb):
"""Test chunksize argument"""
- pass
+ chunksize = 2
+ con = connection_postgis
+ create_postgis(con, df_nybb)
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
+ sql = "SELECT * FROM nybb;"
+ df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
+
+ validate_boro_df(df)
+ # no crs defined on the created geodatabase, and none specified
+ # by user; should not be set to 0, as from get_srid failure
+ assert df.crs is None
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_default(self, engine_postgis, df_nybb):
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
- pass
+ engine = engine_postgis
+ table = "nybb"
+
+ # If table exists, delete it before trying to write with defaults
+ drop_table_if_exists(engine, table)
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
+        # Write to db
+        write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
+        # Validate
+        sql = text("SELECT * FROM {table};".format(table=table))
+        df = read_postgis(sql, engine, geom_col="geometry")
+        validate_boro_df(df)
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_uppercase_tablename(self, engine_postgis, df_nybb):
"""Tests writing GeoDataFrame to PostGIS with uppercase tablename."""
- pass
+ engine = engine_postgis
+ table = "aTestTable"
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
- def test_write_postgis_sqlalchemy_connection(self, engine_postgis, df_nybb
- ):
+        # If table exists, delete it before trying to write with defaults
+        drop_table_if_exists(engine, table)
+
+        # Write to db
+        write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
+        # Validate
+        sql = text('SELECT * FROM "{table}";'.format(table=table))
+        df = read_postgis(sql, engine, geom_col="geometry")
+        validate_boro_df(df)
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_write_postgis_sqlalchemy_connection(self, engine_postgis, df_nybb):
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
- pass
+ with engine_postgis.begin() as con:
+ table = "nybb_con"
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
- def test_write_postgis_fail_when_table_exists(self, engine_postgis, df_nybb
- ):
+            # If table exists, delete it before trying to write with defaults
+            drop_table_if_exists(con, table)
+
+            # Write to db
+            write_postgis(df_nybb, con=con, name=table, if_exists="fail")
+            # Validate
+            sql = text("SELECT * FROM {table};".format(table=table))
+            df = read_postgis(sql, con, geom_col="geometry")
+            validate_boro_df(df)
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_write_postgis_fail_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that uploading the same table raises error when: if_replace='fail'.
"""
- pass
+ engine = engine_postgis
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
- def test_write_postgis_replace_when_table_exists(self, engine_postgis,
- df_nybb):
+ table = "nybb"
+
+ # Ensure table exists
+ write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
+
+ try:
+ write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
+ except ValueError as e:
+ if "already exists" in str(e):
+ pass
+ else:
+ raise e
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_write_postgis_replace_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that replacing a table is possible when: if_replace='replace'.
"""
- pass
+ engine = engine_postgis
+
+ table = "nybb"
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
- def test_write_postgis_append_when_table_exists(self, engine_postgis,
- df_nybb):
+        # Ensure table exists
+        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
+        # Overwrite
+        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
+        # Validate
+        sql = text("SELECT * FROM {table};".format(table=table))
+        df = read_postgis(sql, engine, geom_col="geometry")
+        validate_boro_df(df)
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_write_postgis_append_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that appending to existing table produces correct results when:
if_replace='append'.
"""
- pass
+ engine = engine_postgis
+
+ table = "nybb"
+
+ orig_rows, orig_cols = df_nybb.shape
+ write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
+ write_postgis(df_nybb, con=engine, name=table, if_exists="append")
+ # Validate
+ sql = text("SELECT * FROM {table};".format(table=table))
+ df = read_postgis(sql, engine, geom_col="geometry")
+ new_rows, new_cols = df.shape
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
+ # There should be twice as many rows in the new table
+        assert new_rows == orig_rows * 2, (
+            "There should be {target} rows, found: {current}".format(
+                target=orig_rows * 2, current=new_rows
+            )
+        )
+        # Number of columns should stay the same
+        assert new_cols == orig_cols, (
+            "There should be {target} columns, found: {current}".format(
+                target=orig_cols, current=new_cols
+            )
+        )
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_without_crs(self, engine_postgis, df_nybb):
"""
Tests that GeoDataFrame can be written to PostGIS without CRS information.
"""
- pass
+ engine = engine_postgis
+
+ table = "nybb"
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
+        # Write to db
+        df_nybb.geometry.array.crs = None
+        with pytest.warns(UserWarning, match="Could not parse CRS from the GeoDataF"):
+            write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
+        # Validate that srid is 0
+        sql = text(
+            "SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
+                schema="public", table=table, geom_col="geometry"
+            )
+        )
+        with engine.connect() as conn:
+            target_srid = conn.execute(sql).fetchone()[0]
+        assert target_srid == 0, "SRID should be 0, found %s" % target_srid
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_with_esri_authority(self, engine_postgis, df_nybb):
"""
Tests that GeoDataFrame can be written to PostGIS with ESRI Authority
CRS information (GH #2414).
"""
- pass
+ engine = engine_postgis
+
+ table = "nybb"
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
- def test_write_postgis_geometry_collection(self, engine_postgis,
- df_geom_collection):
+        # Write to db
+        df_nybb_esri = df_nybb.to_crs("ESRI:102003")
+        write_postgis(df_nybb_esri, con=engine, name=table, if_exists="replace")
+        # Validate that srid is 102003
+        sql = text(
+            "SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
+                schema="public", table=table, geom_col="geometry"
+            )
+        )
+        with engine.connect() as conn:
+            target_srid = conn.execute(sql).fetchone()[0]
+        assert target_srid == 102003, "SRID should be 102003, found %s" % target_srid
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_write_postgis_geometry_collection(
+ self, engine_postgis, df_geom_collection
+ ):
"""
Tests that writing a mix of different geometry types is possible.
"""
- pass
+ engine = engine_postgis
+
+ table = "geomtype_tests"
+
+ write_postgis(df_geom_collection, con=engine, name=table, if_exists="replace")
+
+ # Validate geometry type
+ sql = text(
+ "SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
+ table=table
+ )
+ )
+ with engine.connect() as conn:
+ geom_type = conn.execute(sql).fetchone()[0]
+ sql = text("SELECT * FROM {table};".format(table=table))
+ df = read_postgis(sql, engine, geom_col="geometry")
+
+ assert geom_type.upper() == "GEOMETRYCOLLECTION"
+ assert df.geom_type.unique()[0] == "GeometryCollection"
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
- def test_write_postgis_mixed_geometry_types(self, engine_postgis,
- df_mixed_single_and_multi):
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_write_postgis_mixed_geometry_types(
+ self, engine_postgis, df_mixed_single_and_multi
+ ):
"""
Tests that writing a mix of single and MultiGeometries is possible.
"""
- pass
+ engine = engine_postgis
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
+ table = "geomtype_tests"
+
+ write_postgis(
+ df_mixed_single_and_multi, con=engine, name=table, if_exists="replace"
+ )
+
+ # Validate geometry type
+ sql = text(
+ "SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
+ table=table
+ )
+ )
+ with engine.connect() as conn:
+ res = conn.execute(sql).fetchall()
+ assert res[0][0].upper() == "LINESTRING"
+ assert res[1][0].upper() == "MULTILINESTRING"
+ assert res[2][0].upper() == "POINT"
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_linear_ring(self, engine_postgis, df_linear_ring):
"""
Tests that writing a LinearRing.
"""
- pass
+ engine = engine_postgis
+
+ table = "geomtype_tests"
+
+ write_postgis(df_linear_ring, con=engine, name=table, if_exists="replace")
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
- def test_write_postgis_in_chunks(self, engine_postgis,
- df_mixed_single_and_multi):
+        # Validate geometry type
+        sql = text(
+            "SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
+                table=table
+            )
+        )
+        with engine.connect() as conn:
+            geom_type = conn.execute(sql).fetchone()[0]
+
+        assert geom_type.upper() == "LINESTRING"
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_write_postgis_in_chunks(self, engine_postgis, df_mixed_single_and_multi):
"""
Tests writing a LinearRing works.
"""
- pass
+ engine = engine_postgis
+
+ table = "geomtype_tests"
+
+ write_postgis(
+ df_mixed_single_and_multi,
+ con=engine,
+ name=table,
+ if_exists="replace",
+ chunksize=1,
+ )
+ # Validate row count
+ sql = text("SELECT COUNT(geometry) FROM {table};".format(table=table))
+ with engine.connect() as conn:
+ row_cnt = conn.execute(sql).fetchone()[0]
+ assert row_cnt == 3
+
+ # Validate geometry type
+ sql = text(
+ "SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
+ table=table
+ )
+ )
+ with engine.connect() as conn:
+ res = conn.execute(sql).fetchall()
+ assert res[0][0].upper() == "LINESTRING"
+ assert res[1][0].upper() == "MULTILINESTRING"
+ assert res[2][0].upper() == "POINT"
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_to_different_schema(self, engine_postgis, df_nybb):
"""
Tests writing data to alternative schema.
"""
- pass
+ engine = engine_postgis
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
- def test_write_postgis_to_different_schema_when_table_exists(self,
- engine_postgis, df_nybb):
+ table = "nybb"
+ schema_to_use = "test"
+ sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
+ with engine.begin() as conn:
+ conn.execute(sql)
+
+ write_postgis(
+ df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
+ )
+ # Validate
+ sql = text(
+ "SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
+ )
+
+ df = read_postgis(sql, engine, geom_col="geometry")
+ validate_boro_df(df)
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_write_postgis_to_different_schema_when_table_exists(
+ self, engine_postgis, df_nybb
+ ):
"""
Tests writing data to alternative schema.
"""
- pass
+ engine = engine_postgis
+
+ table = "nybb"
+ schema_to_use = "test"
+ sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
+ with engine.begin() as conn:
+ conn.execute(sql)
+
+ try:
+ write_postgis(
+ df_nybb, con=engine, name=table, if_exists="fail", schema=schema_to_use
+ )
+ # Validate
+ sql = text(
+ "SELECT * FROM {schema}.{table};".format(
+ schema=schema_to_use, table=table
+ )
+ )
+
+ df = read_postgis(sql, engine, geom_col="geometry")
+ validate_boro_df(df)
+
+ # Should raise a ValueError when table exists
+ except ValueError:
+ pass
+
+ # Try with replace flag on
+ write_postgis(
+ df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
+ )
+ # Validate
+ sql = text(
+ "SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
+ )
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
+        df = read_postgis(sql, engine, geom_col="geometry")
+        validate_boro_df(df)
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_3D_geometries(self, engine_postgis, df_3D_geoms):
"""
Tests writing a geometries with 3 dimensions works.
"""
- pass
+ engine = engine_postgis
+
+ table = "geomtype_tests"
+
+ write_postgis(df_3D_geoms, con=engine, name=table, if_exists="replace")
+
+ # Check that all geometries have 3 dimensions
+ sql = text("SELECT * FROM {table};".format(table=table))
+ df = read_postgis(sql, engine, geom_col="geometry")
+ assert list(df.geometry.has_z) == [True, True, True]
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_row_order(self, engine_postgis, df_nybb):
"""
Tests that the row order in db table follows the order of the original frame.
"""
- pass
+ engine = engine_postgis
+
+ table = "row_order_test"
+ correct_order = df_nybb["BoroCode"].tolist()
+
+ write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
+
+ # Check that the row order matches
+ sql = text("SELECT * FROM {table};".format(table=table))
+ df = read_postgis(sql, engine, geom_col="geometry")
+ assert df["BoroCode"].tolist() == correct_order
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_before_table_exists(self, engine_postgis, df_nybb):
"""
Tests that insert works with if_exists='append' when table does not exist yet.
"""
- pass
+ engine = engine_postgis
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
+ table = "nybb"
+ # If table exists, delete it before trying to write with defaults
+ drop_table_if_exists(engine, table)
+
+ write_postgis(df_nybb, con=engine, name=table, if_exists="append")
+
+ # Check that the row order matches
+ sql = text("SELECT * FROM {table};".format(table=table))
+ df = read_postgis(sql, engine, geom_col="geometry")
+ validate_boro_df(df)
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_with_different_crs(self, engine_postgis, df_nybb):
"""
Tests that the warning is raised if table CRS differs from frame.
"""
- pass
+ engine = engine_postgis
- @pytest.mark.parametrize('engine_postgis', POSTGIS_DRIVERS, indirect=True)
- @pytest.mark.xfail(compat.PANDAS_GE_20 and not compat.PANDAS_GE_202,
- reason=
- 'Duplicate columns are dropped in read_sql with pandas 2.0.0 and 2.0.1'
- )
+ table = "nybb"
+ write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
+
+ # Reproject
+ df_nybb2 = df_nybb.to_crs(epsg=4326)
+
+ # Should raise error when appending
+ with pytest.raises(ValueError, match="CRS of the target table"):
+ write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_append_without_crs(self, engine_postgis, df_nybb):
+        # This test was included in #3328 when the default SRID for a missing
+        # CRS was changed from -1 to 0. This resolves issues when appending
+        # dataframes without a CRS to PostGIS, since PostGIS uses 0 for
+        # geometries with no CRS.
+ engine = engine_postgis
+ df_nybb = df_nybb.set_crs(None, allow_override=True)
+ table = "nybb"
+
+ write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
+ # append another dataframe with no crs
+
+ df_nybb2 = df_nybb
+ write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
+
+ @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
+ @pytest.mark.xfail(
+ compat.PANDAS_GE_20 and not compat.PANDAS_GE_202,
+ reason="Duplicate columns are dropped in read_sql with pandas 2.0.0 and 2.0.1",
+ )
def test_duplicate_geometry_column_fails(self, engine_postgis):
"""
Tests that a ValueError is raised if an SQL query returns two geometry columns.
"""
- pass
+ engine = engine_postgis
+
+ sql = "select ST_MakePoint(0, 0) as geom, ST_MakePoint(0, 0) as geom;"
+
+ with pytest.raises(ValueError):
+ read_postgis(sql, engine, geom_col="geom")
+
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_read_non_epsg_crs(self, connection_postgis, df_nybb):
+ con = connection_postgis
+ df_nybb = df_nybb.to_crs(crs="esri:54052")
+ create_postgis(con, df_nybb, srid=54052)
+
+ sql = "SELECT * FROM nybb;"
+ df = read_postgis(sql, con)
+ validate_boro_df(df)
+ assert df.crs == "ESRI:54052"
- @pytest.mark.parametrize('connection_postgis', POSTGIS_DRIVERS,
- indirect=True)
+ @pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
+ @mock.patch("shapely.get_srid")
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_read_srid_not_in_table(self, mock_get_srid, connection_postgis, df_nybb):
+ # mock a non-existent srid for edge case if shapely has an srid
+ # not present in postgis table.
+ pyproj = pytest.importorskip("pyproj")
+
+ mock_get_srid.return_value = 99999
+
+ con = connection_postgis
+ df_nybb = df_nybb.to_crs(crs="epsg:4326")
+ create_postgis(con, df_nybb)
+
+ sql = "SELECT * FROM nybb;"
+ with pytest.raises(pyproj.exceptions.CRSError, match="crs not found"):
+ with pytest.warns(UserWarning, match="Could not find srid 99999"):
+ read_postgis(sql, con)
+
+ @mock.patch("geopandas.io.sql._get_spatial_ref_sys_df")
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
+ def test_read_no_spatial_ref_sys_table_in_postgis(
+ self, mock_get_spatial_ref_sys_df, connection_postgis, df_nybb
+ ):
+ # mock for a non-existent spatial_ref_sys database
+
+ mock_get_spatial_ref_sys_df.side_effect = pd.errors.DatabaseError
+
+ con = connection_postgis
+ df_nybb = df_nybb.to_crs(crs="epsg:4326")
+ create_postgis(con, df_nybb, srid=4326)
+
+ sql = "SELECT * FROM nybb;"
+ with pytest.warns(
+ UserWarning, match="Could not find the spatial reference system table"
+ ):
+ df = read_postgis(sql, con)
+
+ assert df.crs == "EPSG:4326"
+
+ @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_non_epsg_crs_chunksize(self, connection_postgis, df_nybb):
"""Test chunksize argument with non epsg crs"""
- pass
+        chunksize = 2
+        con = connection_postgis
+        df_nybb = df_nybb.to_crs(crs="esri:54052")
+
+        create_postgis(con, df_nybb, srid=54052)
+
+        sql = "SELECT * FROM nybb;"
+        df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
+
+        validate_boro_df(df)
+        assert df.crs == "ESRI:54052"
diff --git a/geopandas/io/util.py b/geopandas/io/util.py
index a13ec40c..86ecf69c 100644
--- a/geopandas/io/util.py
+++ b/geopandas/io/util.py
@@ -1,22 +1,58 @@
"""Vendored, cut down version of pyogrio/util.py for use with fiona"""
+
import re
import sys
from urllib.parse import urlparse
-def vsi_path(path: str) ->str:
+def vsi_path(path: str) -> str:
"""
Ensure path is a local path or a GDAL-compatible vsi path.
"""
- pass
+ # path is already in GDAL format
+ if path.startswith("/vsi"):
+ return path
+
+ # Windows drive letters (e.g. "C:\") confuse `urlparse` as they look like
+ # URL schemes
+ if sys.platform == "win32" and re.match("^[a-zA-Z]\\:", path):
+ if not path.split("!")[0].endswith(".zip"):
+ return path
+
+ # prefix then allow to proceed with remaining parsing
+ path = f"zip://{path}"
+
+ path, archive, scheme = _parse_uri(path)
+
+ if scheme or archive or path.endswith(".zip"):
+ return _construct_vsi_path(path, archive, scheme)
-SCHEMES = {'file': 'file', 'zip': 'zip', 'tar': 'tar', 'gzip': 'gzip',
- 'http': 'curl', 'https': 'curl', 'ftp': 'curl', 's3': 's3', 'gs': 'gs',
- 'az': 'az', 'adls': 'adls', 'adl': 'adls', 'hdfs': 'hdfs', 'webhdfs':
- 'webhdfs'}
-CURLSCHEMES = {k for k, v in SCHEMES.items() if v == 'curl'}
+ return path
+
+
+# Supported URI schemes and their mapping to GDAL's VSI suffix.
+SCHEMES = {
+ "file": "file",
+ "zip": "zip",
+ "tar": "tar",
+ "gzip": "gzip",
+ "http": "curl",
+ "https": "curl",
+ "ftp": "curl",
+ "s3": "s3",
+ "gs": "gs",
+ "az": "az",
+ "adls": "adls",
+ "adl": "adls", # fsspec uses this
+ "hdfs": "hdfs",
+ "webhdfs": "webhdfs",
+ # GDAL additionally supports oss and swift for remote filesystems, but
+ # those are for now not added as supported URI
+}
+
+CURLSCHEMES = {k for k, v in SCHEMES.items() if v == "curl"}
def _parse_uri(path: str):
@@ -33,9 +69,50 @@ def _parse_uri(path: str):
scheme : str
URI scheme such as "https" or "zip+s3".
"""
- pass
+ parts = urlparse(path, allow_fragments=False)
+
+ # if the scheme is not one of GDAL's supported schemes, return raw path
+ if parts.scheme and not all(p in SCHEMES for p in parts.scheme.split("+")):
+ return path, "", ""
+
+ # we have a URI
+ path = parts.path
+ scheme = parts.scheme or ""
+ if parts.query:
+ path += "?" + parts.query
-def _construct_vsi_path(path, archive, scheme) ->str:
+ if parts.scheme and parts.netloc:
+ path = parts.netloc + path
+
+ parts = path.split("!")
+ path = parts.pop() if parts else ""
+ archive = parts.pop() if parts else ""
+ return (path, archive, scheme)
+
+
+def _construct_vsi_path(path, archive, scheme) -> str:
"""Convert a parsed path to a GDAL VSI path"""
- pass
+
+ prefix = ""
+ suffix = ""
+ schemes = scheme.split("+")
+
+ if "zip" not in schemes and (archive.endswith(".zip") or path.endswith(".zip")):
+ schemes.insert(0, "zip")
+
+ if schemes:
+ prefix = "/".join(
+ "vsi{0}".format(SCHEMES[p]) for p in schemes if p and p != "file"
+ )
+
+ if schemes[-1] in CURLSCHEMES:
+ suffix = f"{schemes[-1]}://"
+
+ if prefix:
+ if archive:
+ return "/{}/{}{}/{}".format(prefix, suffix, archive, path.lstrip("/"))
+ else:
+ return "/{}/{}{}".format(prefix, suffix, path)
+
+ return path
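+
+
+# Illustrative sketch of the resulting VSI paths (example inputs, not part of
+# this module):
+#   vsi_path("data.gpkg")                    -> "data.gpkg"
+#   vsi_path("zip://archive.zip!layer.shp")  -> "/vsizip/archive.zip/layer.shp"
+#   vsi_path("s3://bucket/key.gpkg")         -> "/vsis3/bucket/key.gpkg"
+#   vsi_path("https://example.com/data.zip")
+#       -> "/vsizip/vsicurl/https://example.com/data.zip"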
diff --git a/geopandas/plotting.py b/geopandas/plotting.py
index 974bdc10..5c2f416e 100644
--- a/geopandas/plotting.py
+++ b/geopandas/plotting.py
@@ -1,14 +1,17 @@
import warnings
from packaging.version import Version
+
import numpy as np
import pandas as pd
from pandas import CategoricalDtype
from pandas.plotting import PlotAccessor
+
import geopandas
+
from ._decorator import doc
-def _sanitize_geoms(geoms, prefix='Multi'):
+def _sanitize_geoms(geoms, prefix="Multi"):
"""
Returns Series like geoms and index, except that any Multi geometries
are split into their components and indices are repeated for all component
@@ -25,7 +28,29 @@ def _sanitize_geoms(geoms, prefix='Multi'):
component_index : index array
indices are repeated for all components in the same Multi geometry
"""
- pass
+ # TODO(shapely) look into simplifying this with
+ # shapely.get_parts(geoms, return_index=True) from shapely 2.0
+ components, component_index = [], []
+
+ if (
+ not geoms.geom_type.str.startswith(prefix).any()
+ and not geoms.is_empty.any()
+ and not geoms.isna().any()
+ ):
+ return geoms, np.arange(len(geoms))
+
+ for ix, geom in enumerate(geoms):
+ if geom is not None and geom.geom_type.startswith(prefix) and not geom.is_empty:
+ for poly in geom.geoms:
+ components.append(poly)
+ component_index.append(ix)
+ elif geom is None or geom.is_empty:
+ continue
+ else:
+ components.append(geom)
+ component_index.append(ix)
+
+ return components, np.array(component_index)
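+
+
+# Hypothetical example of the expansion above: for a GeoSeries containing
+# [Point(0, 0), MultiPoint([(1, 1), (2, 2)])] this returns the component
+# geometries [Point(0, 0), Point(1, 1), Point(2, 2)] and the index array
+# [0, 1, 1], which lets per-row values and kwargs be repeated via np.take.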
def _expand_kwargs(kwargs, multiindex):
@@ -35,7 +60,29 @@ def _expand_kwargs(kwargs, multiindex):
it (in place) to the correct length/formats with help of 'multiindex', unless
the value appears to already be a valid (single) value for the key.
"""
- pass
+ from typing import Iterable
+
+ from matplotlib.colors import is_color_like
+
+ scalar_kwargs = ["marker", "path_effects"]
+ for att, value in kwargs.items():
+ if "color" in att: # color(s), edgecolor(s), facecolor(s)
+ if is_color_like(value):
+ continue
+ elif "linestyle" in att: # linestyle(s)
+ # A single linestyle can be 2-tuple of a number and an iterable.
+ if (
+ isinstance(value, tuple)
+ and len(value) == 2
+ and isinstance(value[1], Iterable)
+ ):
+ continue
+ elif att in scalar_kwargs:
+ # For these attributes, only a single value is allowed, so never expand.
+ continue
+
+ if pd.api.types.is_list_like(value):
+ kwargs[att] = np.take(value, multiindex, axis=0)
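+
+
+# Hypothetical example: with multiindex = [0, 1, 1] (one Point plus an exploded
+# MultiPoint), {"markersize": [10, 20]} is expanded in place to
+# {"markersize": array([10, 20, 20])}, while {"color": "red"} and
+# {"marker": "o"} are left untouched because they are valid single values.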
def _PolygonPatch(polygon, **kwargs):
@@ -54,11 +101,27 @@ def _PolygonPatch(polygon, **kwargs):
(BSD license, https://pypi.org/project/descartes) for PolygonPatch, but
this dependency was removed in favor of the below matplotlib code.
"""
- pass
-
-
-def _plot_polygon_collection(ax, geoms, values=None, color=None, cmap=None,
- vmin=None, vmax=None, autolim=True, **kwargs):
+ from matplotlib.patches import PathPatch
+ from matplotlib.path import Path
+
+ path = Path.make_compound_path(
+ Path(np.asarray(polygon.exterior.coords)[:, :2]),
+ *[Path(np.asarray(ring.coords)[:, :2]) for ring in polygon.interiors],
+ )
+ return PathPatch(path, **kwargs)
+
+
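+# Illustrative usage (not part of the patch): a single patch can be added to an
+# axes directly, e.g. ax.add_patch(_PolygonPatch(poly, facecolor="C0"));
+# _plot_polygon_collection below does the same for many polygons at once via a
+# matplotlib PatchCollection.
+
+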
+def _plot_polygon_collection(
+ ax,
+ geoms,
+ values=None,
+ color=None,
+ cmap=None,
+ vmin=None,
+ vmax=None,
+ autolim=True,
+ **kwargs,
+):
"""
Plots a collection of Polygon and MultiPolygon geometries to `ax`
@@ -87,11 +150,49 @@ def _plot_polygon_collection(ax, geoms, values=None, color=None, cmap=None,
-------
collection : matplotlib.collections.Collection that was plotted
"""
- pass
-
-
-def _plot_linestring_collection(ax, geoms, values=None, color=None, cmap=
- None, vmin=None, vmax=None, autolim=True, **kwargs):
+ from matplotlib.collections import PatchCollection
+
+ geoms, multiindex = _sanitize_geoms(geoms)
+ if values is not None:
+ values = np.take(values, multiindex, axis=0)
+
+ # PatchCollection does not accept some kwargs.
+ kwargs = {
+ att: value
+ for att, value in kwargs.items()
+ if att not in ["markersize", "marker"]
+ }
+
+ # Add to kwargs for easier checking below.
+ if color is not None:
+ kwargs["color"] = color
+
+ _expand_kwargs(kwargs, multiindex)
+
+ collection = PatchCollection([_PolygonPatch(poly) for poly in geoms], **kwargs)
+
+ if values is not None:
+ collection.set_array(np.asarray(values))
+ collection.set_cmap(cmap)
+ if "norm" not in kwargs:
+ collection.set_clim(vmin, vmax)
+
+ ax.add_collection(collection, autolim=autolim)
+ ax.autoscale_view()
+ return collection
+
+
+def _plot_linestring_collection(
+ ax,
+ geoms,
+ values=None,
+ color=None,
+ cmap=None,
+ vmin=None,
+ vmax=None,
+ autolim=True,
+ **kwargs,
+):
"""
Plots a collection of LineString and MultiLineString geometries to `ax`
@@ -113,11 +214,51 @@ def _plot_linestring_collection(ax, geoms, values=None, color=None, cmap=
-------
collection : matplotlib.collections.Collection that was plotted
"""
- pass
-
-
-def _plot_point_collection(ax, geoms, values=None, color=None, cmap=None,
- vmin=None, vmax=None, marker='o', markersize=None, **kwargs):
+ from matplotlib.collections import LineCollection
+
+ geoms, multiindex = _sanitize_geoms(geoms)
+ if values is not None:
+ values = np.take(values, multiindex, axis=0)
+
+ # LineCollection does not accept some kwargs.
+ kwargs = {
+ att: value
+ for att, value in kwargs.items()
+ if att not in ["markersize", "marker"]
+ }
+
+ # Add to kwargs for easier checking below.
+ if color is not None:
+ kwargs["color"] = color
+
+ _expand_kwargs(kwargs, multiindex)
+
+ segments = [np.array(linestring.coords)[:, :2] for linestring in geoms]
+ collection = LineCollection(segments, **kwargs)
+
+ if values is not None:
+ collection.set_array(np.asarray(values))
+ collection.set_cmap(cmap)
+ if "norm" not in kwargs:
+ collection.set_clim(vmin, vmax)
+
+ ax.add_collection(collection, autolim=autolim)
+ ax.autoscale_view()
+ return collection
+
+
+def _plot_point_collection(
+ ax,
+ geoms,
+ values=None,
+ color=None,
+ cmap=None,
+ vmin=None,
+ vmax=None,
+ marker="o",
+ markersize=None,
+ **kwargs,
+):
"""
Plots a collection of Point and MultiPoint geometries to `ax`
@@ -139,11 +280,46 @@ def _plot_point_collection(ax, geoms, values=None, color=None, cmap=None,
-------
collection : matplotlib.collections.Collection that was plotted
"""
- pass
-
-
-def plot_series(s, cmap=None, color=None, ax=None, figsize=None, aspect=
- 'auto', autolim=True, **style_kwds):
+ if values is not None and color is not None:
+ raise ValueError("Can only specify one of 'values' and 'color' kwargs")
+
+ geoms, multiindex = _sanitize_geoms(geoms)
+ # values are expanded below as kwargs["c"]
+
+ x = [p.x if not p.is_empty else None for p in geoms]
+ y = [p.y if not p.is_empty else None for p in geoms]
+
+ # matplotlib 1.4 does not support c=None, and < 2.0 does not support s=None
+ if values is not None:
+ kwargs["c"] = values
+ if markersize is not None:
+ kwargs["s"] = markersize
+
+ # Add to kwargs for easier checking below.
+ if color is not None:
+ kwargs["color"] = color
+ if marker is not None:
+ kwargs["marker"] = marker
+ _expand_kwargs(kwargs, multiindex)
+
+ if "norm" not in kwargs:
+ collection = ax.scatter(x, y, vmin=vmin, vmax=vmax, cmap=cmap, **kwargs)
+ else:
+ collection = ax.scatter(x, y, cmap=cmap, **kwargs)
+
+ return collection
+
+
+def plot_series(
+ s,
+ cmap=None,
+ color=None,
+ ax=None,
+ figsize=None,
+ aspect="auto",
+ autolim=True,
+ **style_kwds,
+):
"""
Plot a GeoSeries.
@@ -189,14 +365,147 @@ def plot_series(s, cmap=None, color=None, ax=None, figsize=None, aspect=
-------
ax : matplotlib axes instance
"""
- pass
-
-def plot_dataframe(df, column=None, cmap=None, color=None, ax=None, cax=
- None, categorical=False, legend=False, scheme=None, k=5, vmin=None,
- vmax=None, markersize=None, figsize=None, legend_kwds=None, categories=
- None, classification_kwds=None, missing_kwds=None, aspect='auto',
- autolim=True, **style_kwds):
+ try:
+ import matplotlib.pyplot as plt
+ except ImportError:
+ raise ImportError(
+ "The matplotlib package is required for plotting in geopandas. "
+ "You can install it using 'conda install -c conda-forge matplotlib' or "
+ "'pip install matplotlib'."
+ )
+
+ if ax is None:
+ fig, ax = plt.subplots(figsize=figsize)
+
+ if aspect == "auto":
+ if s.crs and s.crs.is_geographic:
+ bounds = s.total_bounds
+ y_coord = np.mean([bounds[1], bounds[3]])
+ ax.set_aspect(1 / np.cos(y_coord * np.pi / 180))
+ # formula ported from R package sp
+ # https://github.com/edzer/sp/blob/master/R/mapasp.R
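+            # e.g. for a layer centred at 60 degrees latitude this sets an
+            # aspect of 1 / cos(60 * pi / 180) = 2.0 (hypothetical value), so
+            # a degree of latitude is drawn twice as long as one of longitude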
+ else:
+ ax.set_aspect("equal")
+ elif aspect is not None:
+ ax.set_aspect(aspect)
+
+ if s.empty:
+ warnings.warn(
+ "The GeoSeries you are attempting to plot is "
+ "empty. Nothing has been displayed.",
+ UserWarning,
+ stacklevel=3,
+ )
+ return ax
+
+ if s.is_empty.all():
+ warnings.warn(
+ "The GeoSeries you are attempting to plot is "
+ "composed of empty geometries. Nothing has been displayed.",
+ UserWarning,
+ stacklevel=3,
+ )
+ return ax
+
+ # have colors been given for all geometries?
+ color_given = pd.api.types.is_list_like(color) and len(color) == len(s)
+
+ # if cmap is specified, create range of colors based on cmap
+ values = None
+ if cmap is not None:
+ values = np.arange(len(s))
+ if hasattr(cmap, "N"):
+ values = values % cmap.N
+ style_kwds["vmin"] = style_kwds.get("vmin", values.min())
+ style_kwds["vmax"] = style_kwds.get("vmax", values.max())
+
+ # decompose GeometryCollections
+ geoms, multiindex = _sanitize_geoms(s.geometry, prefix="Geom")
+ values = np.take(values, multiindex, axis=0) if cmap else None
+ # ensure indexes are consistent
+ if color_given and isinstance(color, pd.Series):
+ color = color.reindex(s.index)
+ expl_color = np.take(color, multiindex, axis=0) if color_given else color
+ expl_series = geopandas.GeoSeries(geoms)
+
+ geom_types = expl_series.geom_type
+ poly_idx = np.asarray((geom_types == "Polygon") | (geom_types == "MultiPolygon"))
+ line_idx = np.asarray(
+ (geom_types == "LineString")
+ | (geom_types == "MultiLineString")
+ | (geom_types == "LinearRing")
+ )
+ point_idx = np.asarray((geom_types == "Point") | (geom_types == "MultiPoint"))
+
+ # plot all Polygons and all MultiPolygon components in the same collection
+ polys = expl_series[poly_idx]
+ if not polys.empty:
+ # color overrides both face and edgecolor. As we want people to be
+ # able to use edgecolor as well, pass color to facecolor
+ facecolor = style_kwds.pop("facecolor", None)
+ color_ = expl_color[poly_idx] if color_given else color
+ if color is not None:
+ facecolor = color_
+
+ values_ = values[poly_idx] if cmap else None
+ _plot_polygon_collection(
+ ax,
+ polys,
+ values_,
+ facecolor=facecolor,
+ cmap=cmap,
+ autolim=autolim,
+ **style_kwds,
+ )
+
+ # plot all LineStrings and MultiLineString components in same collection
+ lines = expl_series[line_idx]
+ if not lines.empty:
+ values_ = values[line_idx] if cmap else None
+ color_ = expl_color[line_idx] if color_given else color
+
+ _plot_linestring_collection(
+ ax, lines, values_, color=color_, cmap=cmap, autolim=autolim, **style_kwds
+ )
+
+ # plot all Points in the same collection
+ points = expl_series[point_idx]
+ if not points.empty:
+ values_ = values[point_idx] if cmap else None
+ color_ = expl_color[point_idx] if color_given else color
+
+ _plot_point_collection(
+ ax, points, values_, color=color_, cmap=cmap, **style_kwds
+ )
+
+ ax.figure.canvas.draw_idle()
+ return ax
+
+
+def plot_dataframe(
+ df,
+ column=None,
+ cmap=None,
+ color=None,
+ ax=None,
+ cax=None,
+ categorical=False,
+ legend=False,
+ scheme=None,
+ k=5,
+ vmin=None,
+ vmax=None,
+ markersize=None,
+ figsize=None,
+ legend_kwds=None,
+ categories=None,
+ classification_kwds=None,
+ missing_kwds=None,
+ aspect="auto",
+ autolim=True,
+ **style_kwds,
+):
"""
Plot a GeoDataFrame.
@@ -326,7 +635,326 @@ def plot_dataframe(df, column=None, cmap=None, color=None, ax=None, cax=
See the User Guide page :doc:`../../user_guide/mapping` for details.
"""
- pass
+ if column is not None and color is not None:
+ warnings.warn(
+ "Only specify one of 'column' or 'color'. Using 'color'.",
+ UserWarning,
+ stacklevel=3,
+ )
+ column = None
+
+ try:
+ import matplotlib.pyplot as plt
+ except ImportError:
+ raise ImportError(
+ "The matplotlib package is required for plotting in geopandas. "
+ "You can install it using 'conda install -c conda-forge matplotlib' or "
+ "'pip install matplotlib'."
+ )
+
+ if ax is None:
+ if cax is not None:
+ raise ValueError("'ax' can not be None if 'cax' is not.")
+ fig, ax = plt.subplots(figsize=figsize)
+
+ if aspect == "auto":
+ if df.crs and df.crs.is_geographic:
+ bounds = df.total_bounds
+ y_coord = np.mean([bounds[1], bounds[3]])
+ ax.set_aspect(1 / np.cos(y_coord * np.pi / 180))
+ # formula ported from R package sp
+ # https://github.com/edzer/sp/blob/master/R/mapasp.R
+ else:
+ ax.set_aspect("equal")
+ elif aspect is not None:
+ ax.set_aspect(aspect)
+
+ # GH 1555
+ # if legend_kwds set, copy so we don't update it in place
+ if legend_kwds is not None:
+ legend_kwds = legend_kwds.copy()
+
+ if df.empty:
+ warnings.warn(
+ "The GeoDataFrame you are attempting to plot is "
+ "empty. Nothing has been displayed.",
+ UserWarning,
+ stacklevel=3,
+ )
+ return ax
+
+ if isinstance(markersize, str):
+ markersize = df[markersize].values
+
+ if column is None:
+ return plot_series(
+ df.geometry,
+ cmap=cmap,
+ color=color,
+ ax=ax,
+ figsize=figsize,
+ markersize=markersize,
+ aspect=aspect,
+ autolim=autolim,
+ **style_kwds,
+ )
+
+ # To accept pd.Series and np.arrays as column
+ if isinstance(column, (np.ndarray, pd.Series)):
+ if column.shape[0] != df.shape[0]:
+ raise ValueError(
+ "The dataframe and given column have different number of rows."
+ )
+ else:
+ values = column
+
+ # Make sure index of a Series matches index of df
+ if isinstance(values, pd.Series):
+ values = values.reindex(df.index)
+ else:
+ values = df[column]
+
+ if isinstance(values.dtype, CategoricalDtype):
+ if categories is not None:
+ raise ValueError(
+ "Cannot specify 'categories' when column has categorical dtype"
+ )
+ categorical = True
+ elif (
+ pd.api.types.is_object_dtype(values.dtype)
+ or pd.api.types.is_bool_dtype(values.dtype)
+ or pd.api.types.is_string_dtype(values.dtype)
+ or categories
+ ):
+ categorical = True
+
+ nan_idx = np.asarray(pd.isna(values), dtype="bool")
+
+ if scheme is not None:
+ mc_err = (
+ "The 'mapclassify' package (>= 2.4.0) is "
+ "required to use the 'scheme' keyword."
+ )
+ try:
+ import mapclassify
+
+ except ImportError:
+ raise ImportError(mc_err)
+
+ if Version(mapclassify.__version__) < Version("2.4.0"):
+ raise ImportError(mc_err)
+
+ if classification_kwds is None:
+ classification_kwds = {}
+ if "k" not in classification_kwds:
+ classification_kwds["k"] = k
+
+ binning = mapclassify.classify(
+ np.asarray(values[~nan_idx]), scheme, **classification_kwds
+ )
+ # set categorical to True for creating the legend
+ categorical = True
+ if legend_kwds is not None and "labels" in legend_kwds:
+ if len(legend_kwds["labels"]) != binning.k:
+ raise ValueError(
+ "Number of labels must match number of bins, "
+ "received {} labels for {} bins".format(
+ len(legend_kwds["labels"]), binning.k
+ )
+ )
+ else:
+ labels = list(legend_kwds.pop("labels"))
+ else:
+ fmt = "{:.2f}"
+ if legend_kwds is not None and "fmt" in legend_kwds:
+ fmt = legend_kwds.pop("fmt")
+
+ labels = binning.get_legend_classes(fmt)
+ if legend_kwds is not None:
+ show_interval = legend_kwds.pop("interval", False)
+ else:
+ show_interval = False
+ if not show_interval:
+ labels = [c[1:-1] for c in labels]
+
+ values = pd.Categorical(
+ [np.nan] * len(values), categories=binning.bins, ordered=True
+ )
+ values[~nan_idx] = pd.Categorical.from_codes(
+ binning.yb, categories=binning.bins, ordered=True
+ )
+ if cmap is None:
+ cmap = "viridis"
+
+ # Define `values` as a Series
+ if categorical:
+ if cmap is None:
+ cmap = "tab10"
+
+ cat = pd.Categorical(values, categories=categories)
+ categories = list(cat.categories)
+
+ # values missing in the Categorical but not in original values
+ missing = list(np.unique(values[~nan_idx & cat.isna()]))
+ if missing:
+ raise ValueError(
+ "Column contains values not listed in categories. "
+ "Missing categories: {}.".format(missing)
+ )
+
+ values = cat.codes[~nan_idx]
+ vmin = 0 if vmin is None else vmin
+ vmax = len(categories) - 1 if vmax is None else vmax
+
+    # fill values with a placeholder where there were NaNs originally, so they map
+    # properly (after being removed in the categorical or scheme handling)
+ if categorical:
+ for n in np.where(nan_idx)[0]:
+ values = np.insert(values, n, values[0])
+
+ mn = values[~np.isnan(values)].min() if vmin is None else vmin
+ mx = values[~np.isnan(values)].max() if vmax is None else vmax
+
+ # decompose GeometryCollections
+ geoms, multiindex = _sanitize_geoms(df.geometry, prefix="Geom")
+ values = np.take(values, multiindex, axis=0)
+ nan_idx = np.take(nan_idx, multiindex, axis=0)
+ expl_series = geopandas.GeoSeries(geoms)
+
+ geom_types = expl_series.geom_type
+ poly_idx = np.asarray((geom_types == "Polygon") | (geom_types == "MultiPolygon"))
+ line_idx = np.asarray(
+ (geom_types == "LineString")
+ | (geom_types == "MultiLineString")
+ | (geom_types == "LinearRing")
+ )
+ point_idx = np.asarray((geom_types == "Point") | (geom_types == "MultiPoint"))
+
+ # plot all Polygons and all MultiPolygon components in the same collection
+ polys = expl_series[poly_idx & np.invert(nan_idx)]
+ subset = values[poly_idx & np.invert(nan_idx)]
+ if not polys.empty:
+ _plot_polygon_collection(
+ ax,
+ polys,
+ subset,
+ vmin=mn,
+ vmax=mx,
+ cmap=cmap,
+ autolim=autolim,
+ **style_kwds,
+ )
+
+ # plot all LineStrings and MultiLineString components in same collection
+ lines = expl_series[line_idx & np.invert(nan_idx)]
+ subset = values[line_idx & np.invert(nan_idx)]
+ if not lines.empty:
+ _plot_linestring_collection(
+ ax,
+ lines,
+ subset,
+ vmin=mn,
+ vmax=mx,
+ cmap=cmap,
+ autolim=autolim,
+ **style_kwds,
+ )
+
+ # plot all Points in the same collection
+ points = expl_series[point_idx & np.invert(nan_idx)]
+ subset = values[point_idx & np.invert(nan_idx)]
+ if not points.empty:
+ if isinstance(markersize, np.ndarray):
+ markersize = np.take(markersize, multiindex, axis=0)
+ markersize = markersize[point_idx & np.invert(nan_idx)]
+ _plot_point_collection(
+ ax,
+ points,
+ subset,
+ vmin=mn,
+ vmax=mx,
+ markersize=markersize,
+ cmap=cmap,
+ **style_kwds,
+ )
+
+ missing_data = not expl_series[nan_idx].empty
+ if missing_kwds is not None and missing_data:
+ if color:
+ if "color" not in missing_kwds:
+ missing_kwds["color"] = color
+
+ merged_kwds = style_kwds.copy()
+ merged_kwds.update(missing_kwds)
+
+ plot_series(expl_series[nan_idx], ax=ax, **merged_kwds)
+
+ if legend and not color:
+ if legend_kwds is None:
+ legend_kwds = {}
+ if "fmt" in legend_kwds:
+ legend_kwds.pop("fmt")
+
+ from matplotlib import cm
+ from matplotlib.colors import Normalize
+ from matplotlib.lines import Line2D
+
+ norm = style_kwds.get("norm", None)
+ if not norm:
+ norm = Normalize(vmin=mn, vmax=mx)
+ n_cmap = cm.ScalarMappable(norm=norm, cmap=cmap)
+ if categorical:
+ if scheme is not None:
+ categories = labels
+ patches = []
+ for i in range(len(categories)):
+ patches.append(
+ Line2D(
+ [0],
+ [0],
+ linestyle="none",
+ marker="o",
+ alpha=style_kwds.get("alpha", 1),
+ markersize=10,
+ markerfacecolor=n_cmap.to_rgba(i),
+ markeredgewidth=0,
+ )
+ )
+ if missing_kwds is not None and missing_data:
+ if "color" in merged_kwds:
+ merged_kwds["facecolor"] = merged_kwds["color"]
+ patches.append(
+ Line2D(
+ [0],
+ [0],
+ linestyle="none",
+ marker="o",
+ alpha=merged_kwds.get("alpha", 1),
+ markersize=10,
+ markerfacecolor=merged_kwds.get("facecolor", None),
+ markeredgecolor=merged_kwds.get("edgecolor", None),
+ markeredgewidth=merged_kwds.get(
+ "linewidth", 1 if merged_kwds.get("edgecolor", False) else 0
+ ),
+ )
+ )
+ categories.append(merged_kwds.get("label", "NaN"))
+ legend_kwds.setdefault("numpoints", 1)
+ legend_kwds.setdefault("loc", "best")
+ legend_kwds.setdefault("handles", patches)
+ legend_kwds.setdefault("labels", categories)
+ ax.legend(**legend_kwds)
+ else:
+ if cax is not None:
+ legend_kwds.setdefault("cax", cax)
+ else:
+ legend_kwds.setdefault("ax", ax)
+
+ n_cmap.set_array(np.array([]))
+ ax.get_figure().colorbar(n_cmap, **legend_kwds)
+
+ ax.figure.canvas.draw_idle()
+ return ax
@doc(plot_dataframe)
@@ -335,10 +963,15 @@ class GeoplotAccessor(PlotAccessor):
def __call__(self, *args, **kwargs):
data = self._parent.copy()
- kind = kwargs.pop('kind', 'geo')
- if kind == 'geo':
+ kind = kwargs.pop("kind", "geo")
+ if kind == "geo":
return plot_dataframe(data, *args, **kwargs)
if kind in self._pandas_kinds:
+ # Access pandas plots
return PlotAccessor(data)(kind=kind, **kwargs)
else:
- raise ValueError(f'{kind} is not a valid plot kind')
+ # raise error
+ raise ValueError(f"{kind} is not a valid plot kind")
+
+ def geo(self, *args, **kwargs):
+ return self(kind="geo", *args, **kwargs) # noqa: B026
diff --git a/geopandas/sindex.py b/geopandas/sindex.py
index f72d4f2a..6966cc46 100644
--- a/geopandas/sindex.py
+++ b/geopandas/sindex.py
@@ -1,11 +1,15 @@
import numpy as np
+
import shapely
from shapely.geometry.base import BaseGeometry
+
from . import _compat as compat
from . import array, geoseries
+
PREDICATES = {p.name for p in shapely.strtree.BinaryPredicate} | {None}
+
if compat.GEOS_GE_310:
- PREDICATES.update(['dwithin'])
+ PREDICATES.update(["dwithin"])
class SpatialIndex:
@@ -19,9 +23,15 @@ class SpatialIndex:
"""
def __init__(self, geometry):
+ # set empty geometries to None to avoid segfault on GEOS <= 3.6
+ # see:
+ # https://github.com/pygeos/pygeos/issues/146
+ # https://github.com/pygeos/pygeos/issues/147
non_empty = geometry.copy()
non_empty[shapely.is_empty(non_empty)] = None
+ # set empty geometries to None to maintain indexing
self._tree = shapely.STRtree(non_empty)
+ # store geometries, including empty geometries for user access
self.geometries = geometry.copy()
@property
@@ -38,12 +48,14 @@ class SpatialIndex:
>>> from shapely.geometry import Point
>>> s = geopandas.GeoSeries([Point(0, 0), Point(1, 1)])
>>> s.sindex.valid_query_predicates # doctest: +SKIP
- {None, "contains", "contains_properly", "covered_by", "covers", "crosses", "dwithin", "intersects", "overlaps", "touches", "within"}
+ {None, "contains", "contains_properly", "covered_by", "covers", \
+"crosses", "dwithin", "intersects", "overlaps", "touches", "within"}
"""
- pass
+ return PREDICATES
- def query(self, geometry, predicate=None, sort=False, distance=None,
- output_format='tuple'):
+ def query(
+ self, geometry, predicate=None, sort=False, distance=None, output_format="tuple"
+ ):
"""
Return the integer indices of all combinations of each input geometry
and tree geometries where the bounding box of each input geometry
@@ -73,12 +85,14 @@ class SpatialIndex:
Parameters
----------
- geometry : shapely.Geometry or array-like of geometries (numpy.ndarray, GeoSeries, GeometryArray)
+ geometry : shapely.Geometry or array-like of geometries \
+(numpy.ndarray, GeoSeries, GeometryArray)
A single shapely geometry or array of geometries to query against
the spatial index. For array-like, accepts both GeoPandas geometry
iterables (GeoSeries, GeometryArray) or a numpy array of Shapely
geometries.
- predicate : {None, "contains", "contains_properly", "covered_by", "covers", "crosses", "intersects", "overlaps", "touches", "within", "dwithin"}, optional
+ predicate : {None, "contains", "contains_properly", "covered_by", "covers", \
+"crosses", "intersects", "overlaps", "touches", "within", "dwithin"}, optional
If predicate is provided, the input geometries are tested
using the predicate function against each item in the tree
whose extent intersects the envelope of the input geometry:
@@ -165,7 +179,68 @@ class SpatialIndex:
geometries that can be joined based on overlapping bounding boxes or
optional predicate are returned.
"""
- pass
+ if predicate not in self.valid_query_predicates:
+ if predicate == "dwithin":
+ raise ValueError("predicate = 'dwithin' requires GEOS >= 3.10.0")
+
+ raise ValueError(
+ "Got predicate='{}'; ".format(predicate)
+ + "`predicate` must be one of {}".format(self.valid_query_predicates)
+ )
+
+ # distance argument requirement of predicate `dwithin`
+ # and only valid for predicate `dwithin`
+ kwargs = {}
+ if predicate == "dwithin":
+ if distance is None:
+ # the distance parameter is needed
+ raise ValueError(
+ "'distance' parameter is required for 'dwithin' predicate"
+ )
+ # add distance to kwargs
+ kwargs["distance"] = distance
+
+ elif distance is not None:
+ # distance parameter is invalid
+ raise ValueError(
+ "'distance' parameter is only supported in combination with "
+ "'dwithin' predicate"
+ )
+
+ geometry = self._as_geometry_array(geometry)
+
+ indices = self._tree.query(geometry, predicate=predicate, **kwargs)
+
+ if output_format != "tuple":
+ sort = True
+
+ if sort:
+ if indices.ndim == 1:
+ indices = np.sort(indices)
+ else:
+ # sort by first array (geometry) and then second (tree)
+ geo_idx, tree_idx = indices
+ sort_indexer = np.lexsort((tree_idx, geo_idx))
+ indices = np.vstack((geo_idx[sort_indexer], tree_idx[sort_indexer]))
+
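+        # The alternative output formats express the same matches as a boolean
+        # matrix rather than as a pair of parallel index arrays.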
+ if output_format == "sparse":
+ from scipy.sparse import coo_array
+
+ return coo_array(
+ (np.ones(len(indices[0]), dtype=np.bool_), indices),
+ shape=(len(self.geometries), len(geometry)),
+ dtype=np.bool_,
+ )
+
+ if output_format == "dense":
+ dense = np.zeros((len(self.geometries), len(geometry)), dtype=bool)
+ dense[indices] = True
+ return dense
+
+ if output_format == "tuple":
+ return indices
+
+ raise ValueError("Invalid output_format: {}".format(output_format))
@staticmethod
def _as_geometry_array(geometry):
@@ -182,10 +257,27 @@ class SpatialIndex:
np.ndarray
A numpy array of Shapely geometries.
"""
- pass
-
- def nearest(self, geometry, return_all=True, max_distance=None,
- return_distance=False, exclusive=False):
+ if isinstance(geometry, np.ndarray):
+ return array.from_shapely(geometry)._data
+ elif isinstance(geometry, geoseries.GeoSeries):
+ return geometry.values._data
+ elif isinstance(geometry, array.GeometryArray):
+ return geometry._data
+ elif isinstance(geometry, BaseGeometry):
+ return geometry
+ elif geometry is None:
+ return None
+ else:
+ return np.asarray(geometry)
+
+ def nearest(
+ self,
+ geometry,
+ return_all=True,
+ max_distance=None,
+ return_distance=False,
+ exclusive=False,
+ ):
"""
Return the nearest geometry in the tree for each input geometry in
``geometry``.
@@ -209,7 +301,8 @@ class SpatialIndex:
Parameters
----------
- geometry : {shapely.geometry, GeoSeries, GeometryArray, numpy.array of Shapely geometries}
+ geometry : {shapely.geometry, GeoSeries, GeometryArray, numpy.array of Shapely \
+geometries}
A single shapely geometry, one of the GeoPandas geometry iterables
(GeoSeries, GeometryArray), or a numpy array of Shapely geometries to query
against the spatial index.
@@ -264,7 +357,26 @@ class SpatialIndex:
array([[0, 1],
[8, 9]])
"""
- pass
+ geometry = self._as_geometry_array(geometry)
+ if isinstance(geometry, BaseGeometry) or geometry is None:
+ geometry = [geometry]
+
+ result = self._tree.query_nearest(
+ geometry,
+ max_distance=max_distance,
+ return_distance=return_distance,
+ all_matches=return_all,
+ exclusive=exclusive,
+ )
+        if return_distance:
+            indices, distances = result
+            return indices, distances
+
+        indices = result
+        return indices
def intersection(self, coordinates):
"""Compatibility wrapper for rtree.index.Index.intersection,
@@ -302,7 +414,34 @@ class SpatialIndex:
array([1, 2, 3])
"""
- pass
+ # TODO: we should deprecate this
+ # convert bounds to geometry
+ # the old API uses tuples of bound, but Shapely uses geometries
+ try:
+ iter(coordinates)
+ except TypeError:
+ # likely not an iterable
+ # this is a check that rtree does, we mimic it
+ # to ensure a useful failure message
+ raise TypeError(
+ "Invalid coordinates, must be iterable in format "
+ "(minx, miny, maxx, maxy) (for bounds) or (x, y) (for points). "
+ "Got `coordinates` = {}.".format(coordinates)
+ )
+
+ # need to convert tuple of bounds to a geometry object
+ if len(coordinates) == 4:
+ indexes = self._tree.query(shapely.box(*coordinates))
+ elif len(coordinates) == 2:
+ indexes = self._tree.query(shapely.points(*coordinates))
+ else:
+ raise TypeError(
+ "Invalid coordinates, must be iterable in format "
+ "(minx, miny, maxx, maxy) (for bounds) or (x, y) (for points). "
+ "Got `coordinates` = {}.".format(coordinates)
+ )
+
+ return indexes
@property
def size(self):
@@ -330,7 +469,7 @@ class SpatialIndex:
>>> s.sindex.size
10
"""
- pass
+ return len(self._tree)
@property
def is_empty(self):
@@ -360,7 +499,7 @@ class SpatialIndex:
>>> s2.sindex.is_empty
True
"""
- pass
+ return len(self._tree) == 0
def __len__(self):
return len(self._tree)
diff --git a/geopandas/testing.py b/geopandas/testing.py
index 582d8a23..62328e45 100644
--- a/geopandas/testing.py
+++ b/geopandas/testing.py
@@ -1,15 +1,31 @@
"""
Testing functionality for geopandas objects.
"""
+
import warnings
+
import pandas as pd
+
from geopandas import GeoDataFrame, GeoSeries
from geopandas.array import GeometryDtype
def _isna(this):
"""isna version that works for both scalars and (Geo)Series"""
- pass
+ with warnings.catch_warnings():
+ # GeoSeries.isna will raise a warning about no longer returning True
+ # for empty geometries. This helper is used below always in combination
+ # with an is_empty check to preserve behaviour, and thus we ignore the
+ # warning here to avoid it bubbling up to the user
+ warnings.filterwarnings(
+ "ignore", r"GeoSeries.isna\(\) previously returned", UserWarning
+ )
+ if hasattr(this, "isna"):
+ return this.isna()
+ elif hasattr(this, "isnull"):
+ return this.isnull()
+ else:
+ return pd.isnull(this)
def _geom_equals_mask(this, that):
@@ -27,7 +43,12 @@ def _geom_equals_mask(this, that):
Series
boolean Series, True if geometries in left equal geometries in right
"""
- pass
+
+ return (
+ this.geom_equals(that)
+ | (this.is_empty & that.is_empty)
+ | (_isna(this) & _isna(that))
+ )
def geom_equals(this, that):
@@ -45,7 +66,8 @@ def geom_equals(this, that):
bool
True if all geometries in left equal geometries in right
"""
- pass
+
+ return _geom_equals_mask(this, that).all()
def _geom_almost_equals_mask(this, that):
@@ -65,7 +87,12 @@ def _geom_almost_equals_mask(this, that):
Series
boolean Series, True if geometries in left almost equal geometries in right
"""
- pass
+
+ return (
+ this.geom_equals_exact(that, tolerance=0.5 * 10 ** (-6))
+ | (this.is_empty & that.is_empty)
+ | (_isna(this) & _isna(that))
+ )
def geom_almost_equals(this, that):
@@ -86,12 +113,24 @@ def geom_almost_equals(this, that):
bool
True if all geometries in left almost equal geometries in right
"""
- pass
+ if isinstance(this, GeoDataFrame) and isinstance(that, GeoDataFrame):
+ this = this.geometry
+ that = that.geometry
+
+ return _geom_almost_equals_mask(this, that).all()
-def assert_geoseries_equal(left, right, check_dtype=True, check_index_type=
- False, check_series_type=True, check_less_precise=False,
- check_geom_type=False, check_crs=True, normalize=False):
+def assert_geoseries_equal(
+ left,
+ right,
+ check_dtype=True,
+ check_index_type=False,
+ check_series_type=True,
+ check_less_precise=False,
+ check_geom_type=False,
+ check_crs=True,
+ normalize=False,
+):
"""
Test util for checking that two GeoSeries are equal.
@@ -119,18 +158,100 @@ def assert_geoseries_equal(left, right, check_dtype=True, check_index_type=
Typically useful with ``check_less_precise=True``, which uses
``geom_equals_exact`` and requires exact coordinate order.
"""
- pass
+ assert len(left) == len(right), "%d != %d" % (len(left), len(right))
+
+ if check_dtype:
+ msg = "dtype should be a GeometryDtype, got {0}"
+ assert isinstance(left.dtype, GeometryDtype), msg.format(left.dtype)
+        assert isinstance(right.dtype, GeometryDtype), msg.format(right.dtype)
+
+ if check_index_type:
+ assert isinstance(left.index, type(right.index))
+
+ if check_series_type:
+ assert isinstance(left, GeoSeries)
+ assert isinstance(left, type(right))
+
+ if check_crs:
+ assert left.crs == right.crs
+ else:
+ if not isinstance(left, GeoSeries):
+ left = GeoSeries(left)
+ if not isinstance(right, GeoSeries):
+ right = GeoSeries(right, index=left.index)
+
+ assert left.index.equals(right.index), "index: %s != %s" % (left.index, right.index)
+
+ if check_geom_type:
+ assert (left.geom_type == right.geom_type).all(), "type: %s != %s" % (
+ left.geom_type,
+ right.geom_type,
+ )
+
+ if normalize:
+ left = GeoSeries(left.array.normalize())
+ right = GeoSeries(right.array.normalize())
+
+ if not check_crs:
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", "CRS mismatch", UserWarning)
+ _check_equality(left, right, check_less_precise)
+ else:
+ _check_equality(left, right, check_less_precise)
def _truncated_string(geom):
"""Truncated WKT repr of geom"""
- pass
+ s = str(geom)
+ if len(s) > 100:
+ return s[:100] + "..."
+ else:
+ return s
+
+
+def _check_equality(left, right, check_less_precise):
+ assert_error_message = (
+ "{0} out of {1} geometries are not {3}equal.\n"
+ "Indices where geometries are not {3}equal: {2} \n"
+ "The first not {3}equal geometry:\n"
+ "Left: {4}\n"
+ "Right: {5}\n"
+ )
+ if check_less_precise:
+ precise = "almost "
+ equal = _geom_almost_equals_mask(left, right)
+ else:
+ precise = ""
+ equal = _geom_equals_mask(left, right)
+ if not equal.all():
+ unequal_left_geoms = left[~equal]
+ unequal_right_geoms = right[~equal]
+ raise AssertionError(
+ assert_error_message.format(
+ len(unequal_left_geoms),
+ len(left),
+ unequal_left_geoms.index.to_list(),
+ precise,
+ _truncated_string(unequal_left_geoms.iloc[0]),
+ _truncated_string(unequal_right_geoms.iloc[0]),
+ )
+ )
-def assert_geodataframe_equal(left, right, check_dtype=True,
- check_index_type='equiv', check_column_type='equiv', check_frame_type=
- True, check_like=False, check_less_precise=False, check_geom_type=False,
- check_crs=True, normalize=False):
+
+def assert_geodataframe_equal(
+ left,
+ right,
+ check_dtype=True,
+ check_index_type="equiv",
+ check_column_type="equiv",
+ check_frame_type=True,
+ check_like=False,
+ check_less_precise=False,
+ check_geom_type=False,
+ check_crs=True,
+ normalize=False,
+):
"""
Check that two GeoDataFrames are equal/
@@ -158,4 +279,80 @@ def assert_geodataframe_equal(left, right, check_dtype=True,
Typically useful with ``check_less_precise=True``, which uses
``geom_equals_exact`` and requires exact coordinate order.
"""
- pass
+ try:
+ # added from pandas 0.20
+ from pandas.testing import assert_frame_equal, assert_index_equal
+ except ImportError:
+ from pandas.util.testing import assert_frame_equal, assert_index_equal
+
+ # instance validation
+ if check_frame_type:
+ assert isinstance(left, GeoDataFrame)
+ assert isinstance(left, type(right))
+
+ if check_crs:
+ # allow if neither left and right has an active geometry column
+ if (
+ left._geometry_column_name is None
+ and right._geometry_column_name is None
+ ):
+ pass
+ elif (
+ left._geometry_column_name not in left.columns
+ and right._geometry_column_name not in right.columns
+ ):
+ pass
+ # no crs can be either None or {}
+ elif not left.crs and not right.crs:
+ pass
+ else:
+ assert left.crs == right.crs
+ else:
+ if not isinstance(left, GeoDataFrame):
+ left = GeoDataFrame(left)
+ if not isinstance(right, GeoDataFrame):
+ right = GeoDataFrame(right)
+
+ # shape comparison
+ assert left.shape == right.shape, (
+ "GeoDataFrame shape mismatch, left: {lshape!r}, right: {rshape!r}.\n"
+ "Left columns: {lcols!r}, right columns: {rcols!r}"
+ ).format(
+ lshape=left.shape, rshape=right.shape, lcols=left.columns, rcols=right.columns
+ )
+
+ if check_like:
+ left = left.reindex_like(right)
+
+ # column comparison
+ assert_index_equal(
+ left.columns, right.columns, exact=check_column_type, obj="GeoDataFrame.columns"
+ )
+
+ # geometry comparison
+ for col, dtype in left.dtypes.items():
+ if isinstance(dtype, GeometryDtype):
+ assert_geoseries_equal(
+ left[col],
+ right[col],
+ normalize=normalize,
+ check_dtype=check_dtype,
+ check_less_precise=check_less_precise,
+ check_geom_type=check_geom_type,
+ check_crs=check_crs,
+ )
+
+ # ensure the active geometry column is the same
+ assert left._geometry_column_name == right._geometry_column_name
+
+ # drop geometries and check remaining columns
+ left2 = left.select_dtypes(exclude="geometry")
+ right2 = right.select_dtypes(exclude="geometry")
+ assert_frame_equal(
+ left2,
+ right2,
+ check_dtype=check_dtype,
+ check_index_type=check_index_type,
+ check_column_type=check_column_type,
+ obj="GeoDataFrame",
+ )
diff --git a/geopandas/tools/_random.py b/geopandas/tools/_random.py
index b79a37a9..007d02fa 100644
--- a/geopandas/tools/_random.py
+++ b/geopandas/tools/_random.py
@@ -1,6 +1,9 @@
from warnings import warn
+
import numpy
+
from shapely.geometry import MultiPoint
+
from geopandas.array import from_shapely, points_from_xy
from geopandas.geoseries import GeoSeries
@@ -37,18 +40,45 @@ def uniform(geom, size, rng=None):
>>> square = box(0,0,1,1)
>>> uniform(square, size=102) # doctest: +SKIP
"""
- pass
+ generator = numpy.random.default_rng(seed=rng)
+
+ if geom is None or geom.is_empty:
+ return MultiPoint()
+
+ if geom.geom_type in ("Polygon", "MultiPolygon"):
+ return _uniform_polygon(geom, size=size, generator=generator)
+
+ if geom.geom_type in ("LineString", "MultiLineString"):
+ return _uniform_line(geom, size=size, generator=generator)
+
+ warn(
+ f"Sampling is not supported for {geom.geom_type} geometry type.",
+ UserWarning,
+ stacklevel=8,
+ )
+ return MultiPoint()
def _uniform_line(geom, size, generator):
"""
Sample points from an input shapely linestring
"""
- pass
+
+ fracs = generator.uniform(size=size)
+ return from_shapely(geom.interpolate(fracs, normalized=True)).union_all()
def _uniform_polygon(geom, size, generator):
"""
Sample uniformly from within a polygon using batched sampling.
"""
- pass
+ xmin, ymin, xmax, ymax = geom.bounds
+ candidates = []
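+    # Batched rejection sampling: draw `size` candidates in the bounding box per
+    # iteration and keep only those contained in the polygon until enough points
+    # have been collected.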
+ while len(candidates) < size:
+ batch = points_from_xy(
+ x=generator.uniform(xmin, xmax, size=size),
+ y=generator.uniform(ymin, ymax, size=size),
+ )
+ valid_samples = batch[batch.sindex.query(geom, predicate="contains")]
+ candidates.extend(valid_samples)
+ return GeoSeries(candidates[:size]).union_all()
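+
+
+# Illustrative sketch (not part of the module): assuming shapely's ``box`` helper,
+# ``uniform(box(0, 0, 1, 1), size=10, rng=1)`` would return a MultiPoint of 10
+# points drawn uniformly from the unit square via the batched sampler above.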
diff --git a/geopandas/tools/_show_versions.py b/geopandas/tools/_show_versions.py
index 661c2c22..26d02f3a 100644
--- a/geopandas/tools/_show_versions.py
+++ b/geopandas/tools/_show_versions.py
@@ -11,7 +11,15 @@ def _get_sys_info():
sys_info : dict
system and Python version information
"""
- pass
+ python = sys.version.replace("\n", " ")
+
+ blob = [
+ ("python", python),
+ ("executable", sys.executable),
+ ("machine", platform.platform()),
+ ]
+
+ return dict(blob)
def _get_C_info():
@@ -21,7 +29,67 @@ def _get_C_info():
c_info: dict
system PROJ information
"""
- pass
+ try:
+ import pyproj
+
+ proj_version = pyproj.proj_version_str
+ except Exception:
+ proj_version = None
+ try:
+ import pyproj
+
+ proj_dir = pyproj.datadir.get_data_dir()
+ except Exception:
+ proj_dir = None
+
+ try:
+ import shapely._buildcfg
+
+ geos_version = "{}.{}.{}".format(*shapely._buildcfg.geos_version)
+ geos_dir = shapely._buildcfg.geos_library_path
+ except Exception:
+ try:
+ from shapely import geos_version_string
+
+ geos_version = geos_version_string
+ geos_dir = None
+ except Exception:
+ geos_version = None
+ geos_dir = None
+
+ try:
+ import pyogrio
+
+ gdal_version = pyogrio.__gdal_version_string__
+ gdal_dir = pyogrio.get_gdal_data_path()
+ except Exception:
+ gdal_version = None
+ gdal_dir = None
+
+ if gdal_version is None:
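+        # Fall back to fiona when pyogrio is unavailable or did not report a
+        # GDAL version.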
+ try:
+ import fiona
+
+ gdal_version = fiona.env.get_gdal_release_name()
+ except Exception:
+ gdal_version = None
+ try:
+ import fiona
+
+ gdal_dir = fiona.env.GDALDataFinder().search()
+ except Exception:
+ gdal_dir = None
+
+ blob = [
+ ("GEOS", geos_version),
+ ("GEOS lib", geos_dir),
+ ("GDAL", gdal_version),
+ ("GDAL data dir", gdal_dir),
+ ("PROJ", proj_version),
+ ("PROJ data dir", proj_dir),
+ ]
+
+ return dict(blob)
def _get_deps_info():
@@ -32,7 +100,42 @@ def _get_deps_info():
deps_info: dict
version information on relevant Python libraries
"""
- pass
+ deps = [
+ "geopandas",
+ # required deps
+ "numpy",
+ "pandas",
+ "pyproj",
+ "shapely",
+ # optional deps
+ "pyogrio",
+ "geoalchemy2",
+ "geopy",
+ "matplotlib",
+ "mapclassify",
+ "fiona",
+ "psycopg",
+ "psycopg2",
+ "pyarrow",
+ ]
+
+ def get_version(module):
+ return module.__version__
+
+ deps_info = {}
+
+ for modname in deps:
+ try:
+ if modname in sys.modules:
+ mod = sys.modules[modname]
+ else:
+ mod = importlib.import_module(modname)
+ ver = get_version(mod)
+ deps_info[modname] = ver
+ except Exception:
+ deps_info[modname] = None
+
+ return deps_info
def show_versions():
@@ -46,4 +149,21 @@ def show_versions():
$ python -c "import geopandas; geopandas.show_versions()"
"""
- pass
+ sys_info = _get_sys_info()
+ deps_info = _get_deps_info()
+ proj_info = _get_C_info()
+
+ maxlen = max(len(x) for x in deps_info)
+ tpl = "{{k:<{maxlen}}}: {{stat}}".format(maxlen=maxlen)
+ print("\nSYSTEM INFO")
+ print("-----------")
+ for k, stat in sys_info.items():
+ print(tpl.format(k=k, stat=stat))
+ print("\nGEOS, GDAL, PROJ INFO")
+ print("---------------------")
+ for k, stat in proj_info.items():
+ print(tpl.format(k=k, stat=stat))
+ print("\nPYTHON DEPENDENCIES")
+ print("-------------------")
+ for k, stat in deps_info.items():
+ print(tpl.format(k=k, stat=stat))
diff --git a/geopandas/tools/clip.py b/geopandas/tools/clip.py
index 2ec2edaf..0382ff2d 100644
--- a/geopandas/tools/clip.py
+++ b/geopandas/tools/clip.py
@@ -5,14 +5,24 @@ geopandas.clip
A module to clip vector data using GeoPandas.
"""
+
import warnings
+
import numpy as np
import pandas.api.types
+
from shapely.geometry import MultiPolygon, Polygon, box
+
from geopandas import GeoDataFrame, GeoSeries
from geopandas.array import _check_crs, _crs_mismatch_warn
+def _mask_is_list_like_rectangle(mask):
+ return pandas.api.types.is_list_like(mask) and not isinstance(
+ mask, (GeoDataFrame, GeoSeries, Polygon, MultiPolygon)
+ )
+
+
def _clip_gdf_with_mask(gdf, mask, sort=False):
"""Clip geometry to the polygon/rectangle extent.
@@ -37,7 +47,46 @@ def _clip_gdf_with_mask(gdf, mask, sort=False):
The returned GeoDataFrame is a clipped subset of gdf
that intersects with polygon/rectangle.
"""
- pass
+ clipping_by_rectangle = _mask_is_list_like_rectangle(mask)
+ if clipping_by_rectangle:
+ intersection_polygon = box(*mask)
+ else:
+ intersection_polygon = mask
+
+ gdf_sub = gdf.iloc[
+ gdf.sindex.query(intersection_polygon, predicate="intersects", sort=sort)
+ ]
+
+ # For performance reasons points don't need to be intersected with poly
+ non_point_mask = gdf_sub.geom_type != "Point"
+
+ if not non_point_mask.any():
+ # only points, directly return
+ return gdf_sub
+
+ # Clip the data with the polygon
+ if isinstance(gdf_sub, GeoDataFrame):
+ clipped = gdf_sub.copy()
+ if clipping_by_rectangle:
+ clipped.loc[non_point_mask, clipped._geometry_column_name] = (
+ gdf_sub.geometry.values[non_point_mask].clip_by_rect(*mask)
+ )
+ else:
+ clipped.loc[non_point_mask, clipped._geometry_column_name] = (
+ gdf_sub.geometry.values[non_point_mask].intersection(mask)
+ )
+ else:
+ # GeoSeries
+ clipped = gdf_sub.copy()
+ if clipping_by_rectangle:
+ clipped[non_point_mask] = gdf_sub.values[non_point_mask].clip_by_rect(*mask)
+ else:
+ clipped[non_point_mask] = gdf_sub.values[non_point_mask].intersection(mask)
+
+ if clipping_by_rectangle:
+ # clip_by_rect might return empty geometry collections in edge cases
+ clipped = clipped[~clipped.is_empty]
+ return clipped
def clip(gdf, mask, keep_geom_type=False, sort=False):
@@ -105,4 +154,104 @@ def clip(gdf, mask, keep_geom_type=False, sort=False):
>>> nws_groceries.shape
(7, 8)
"""
- pass
+ if not isinstance(gdf, (GeoDataFrame, GeoSeries)):
+ raise TypeError(
+ "'gdf' should be GeoDataFrame or GeoSeries, got {}".format(type(gdf))
+ )
+
+ mask_is_list_like = _mask_is_list_like_rectangle(mask)
+ if (
+ not isinstance(mask, (GeoDataFrame, GeoSeries, Polygon, MultiPolygon))
+ and not mask_is_list_like
+ ):
+ raise TypeError(
+ "'mask' should be GeoDataFrame, GeoSeries,"
+ f"(Multi)Polygon or list-like, got {type(mask)}"
+ )
+
+ if mask_is_list_like and len(mask) != 4:
+ raise TypeError(
+ "If 'mask' is list-like, it must have four values (minx, miny, maxx, maxy)"
+ )
+
+ if isinstance(mask, (GeoDataFrame, GeoSeries)):
+ if not _check_crs(gdf, mask):
+ _crs_mismatch_warn(gdf, mask, stacklevel=3)
+
+ if isinstance(mask, (GeoDataFrame, GeoSeries)):
+ box_mask = mask.total_bounds
+ elif mask_is_list_like:
+ box_mask = mask
+ else:
+ # Avoid empty tuple returned by .bounds when geometry is empty. A tuple of
+ # all nan values is consistent with the behavior of
+ # {GeoSeries, GeoDataFrame}.total_bounds for empty geometries.
+        # TODO(shapely) can simply use mask.bounds once relying on Shapely 2.0
+ box_mask = mask.bounds if not mask.is_empty else (np.nan,) * 4
+ box_gdf = gdf.total_bounds
+ if not (
+ ((box_mask[0] <= box_gdf[2]) and (box_gdf[0] <= box_mask[2]))
+ and ((box_mask[1] <= box_gdf[3]) and (box_gdf[1] <= box_mask[3]))
+ ):
+ return gdf.iloc[:0]
+
+ if isinstance(mask, (GeoDataFrame, GeoSeries)):
+ combined_mask = mask.geometry.union_all()
+ else:
+ combined_mask = mask
+
+ clipped = _clip_gdf_with_mask(gdf, combined_mask, sort=sort)
+
+ if keep_geom_type:
+ geomcoll_concat = (clipped.geom_type == "GeometryCollection").any()
+ geomcoll_orig = (gdf.geom_type == "GeometryCollection").any()
+
+ new_collection = geomcoll_concat and not geomcoll_orig
+
+ if geomcoll_orig:
+ warnings.warn(
+ "keep_geom_type can not be called on a "
+ "GeoDataFrame with GeometryCollection.",
+ stacklevel=2,
+ )
+ else:
+ polys = ["Polygon", "MultiPolygon"]
+ lines = ["LineString", "MultiLineString", "LinearRing"]
+ points = ["Point", "MultiPoint"]
+
+            # Check whether the gdf contains multiple geom types (points, lines, polys)
+ orig_types_total = sum(
+ [
+ gdf.geom_type.isin(polys).any(),
+ gdf.geom_type.isin(lines).any(),
+ gdf.geom_type.isin(points).any(),
+ ]
+ )
+
+ # Check how many geometry types are in the clipped GeoDataFrame
+ clip_types_total = sum(
+ [
+ clipped.geom_type.isin(polys).any(),
+ clipped.geom_type.isin(lines).any(),
+ clipped.geom_type.isin(points).any(),
+ ]
+ )
+
+ # Check there aren't any new geom types in the clipped GeoDataFrame
+ more_types = orig_types_total < clip_types_total
+
+ if orig_types_total > 1:
+ warnings.warn(
+ "keep_geom_type can not be called on a mixed type GeoDataFrame.",
+ stacklevel=2,
+ )
+ elif new_collection or more_types:
+ orig_type = gdf.geom_type.iloc[0]
+ if new_collection:
+ clipped = clipped.explode(index_parts=False)
+ if orig_type in polys:
+ clipped = clipped.loc[clipped.geom_type.isin(polys)]
+ elif orig_type in lines:
+ clipped = clipped.loc[clipped.geom_type.isin(lines)]
+
+ return clipped
diff --git a/geopandas/tools/geocoding.py b/geopandas/tools/geocoding.py
index 995d1d88..d1b9aaa6 100644
--- a/geopandas/tools/geocoding.py
+++ b/geopandas/tools/geocoding.py
@@ -1,7 +1,10 @@
import time
from collections import defaultdict
+
import pandas as pd
+
from shapely.geometry import Point
+
import geopandas
@@ -10,7 +13,13 @@ def _get_throttle_time(provider):
Amount of time to wait between requests to a geocoding API, for providers
that specify rate limits in their terms of service.
"""
- pass
+ import geopy.geocoders
+
+ # https://operations.osmfoundation.org/policies/nominatim/
+ if provider == geopy.geocoders.Nominatim:
+ return 1
+ else:
+ return 0
def geocode(strings, provider=None, **kwargs):
@@ -50,7 +59,12 @@ def geocode(strings, provider=None, **kwargs):
0 POINT (-71.05863 42.35899) Boston, MA, United States
1 POINT (-77.03651 38.89766) 1600 Pennsylvania Ave NW, Washington, DC 20006...
"""
- pass
+
+ if provider is None:
+ provider = "photon"
+ throttle_time = _get_throttle_time(provider)
+
+ return _query(strings, True, provider, throttle_time, **kwargs)
def reverse_geocode(points, provider=None, **kwargs):
@@ -96,7 +110,43 @@ def reverse_geocode(points, provider=None, **kwargs):
0 POINT (-71.05941 42.35837) 29 Court Sq, Boston, MA 02108, United States
1 POINT (-77.03641 38.89766) 1600 Pennsylvania Ave NW, Washington, DC 20006...
"""
- pass
+
+ if provider is None:
+ provider = "photon"
+ throttle_time = _get_throttle_time(provider)
+
+ return _query(points, False, provider, throttle_time, **kwargs)
+
+
+def _query(data, forward, provider, throttle_time, **kwargs):
+ # generic wrapper for calls over lists to geopy Geocoders
+ from geopy.geocoders import get_geocoder_for_service
+ from geopy.geocoders.base import GeocoderQueryError
+
+ if forward:
+ if not isinstance(data, pd.Series):
+ data = pd.Series(data)
+ else:
+ if not isinstance(data, geopandas.GeoSeries):
+ data = geopandas.GeoSeries(data)
+
+ if isinstance(provider, str):
+ provider = get_geocoder_for_service(provider)
+
+ coder = provider(**kwargs)
+ results = {}
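+    # Geocode the entries one at a time, sleeping between requests so providers
+    # with rate limits (e.g. Nominatim) are not overwhelmed.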
+ for i, s in data.items():
+ try:
+ if forward:
+ results[i] = coder.geocode(s)
+ else:
+ results[i] = coder.reverse((s.y, s.x), exactly_one=True)
+ except (GeocoderQueryError, ValueError):
+ results[i] = (None, None)
+ time.sleep(throttle_time)
+
+ df = _prepare_geocode_result(results)
+ return df
def _prepare_geocode_result(results):
@@ -107,4 +157,28 @@ def _prepare_geocode_result(results):
(address, (lat, lon))
"""
- pass
+ # Prepare the data for the DataFrame as a dict of lists
+ d = defaultdict(list)
+ index = []
+
+ for i, s in results.items():
+ if s is None:
+ p = Point()
+ address = None
+
+ else:
+ address, loc = s
+
+ # loc is lat, lon and we want lon, lat
+ if loc is None:
+ p = Point()
+ else:
+ p = Point(loc[1], loc[0])
+
+ d["geometry"].append(p)
+ d["address"].append(address)
+ index.append(i)
+
+ df = geopandas.GeoDataFrame(d, index=index, crs="EPSG:4326")
+
+ return df
diff --git a/geopandas/tools/hilbert_curve.py b/geopandas/tools/hilbert_curve.py
index 7315816a..4d42abac 100644
--- a/geopandas/tools/hilbert_curve.py
+++ b/geopandas/tools/hilbert_curve.py
@@ -23,7 +23,20 @@ def _hilbert_distance(geoms, total_bounds=None, level=16):
Array containing distances along the Hilbert curve
"""
- pass
+ if geoms.is_empty.any() | geoms.isna().any():
+ raise ValueError(
+ "Hilbert distance cannot be computed on a GeoSeries with empty or "
+ "missing geometries.",
+ )
+ # Calculate bounds as numpy array
+ bounds = geoms.bounds
+
+ # Calculate discrete coords based on total bounds and bounds
+ x, y = _continuous_to_discrete_coords(bounds, level, total_bounds)
+ # Compute distance along hilbert curve
+ distances = _encode(level, x, y)
+
+ return distances
def _continuous_to_discrete_coords(bounds, level, total_bounds):
@@ -46,7 +59,29 @@ def _continuous_to_discrete_coords(bounds, level, total_bounds):
Two-dimensional array Array of hilbert distances for each geom
"""
- pass
+ # Hilbert Side length
+ side_length = (2**level) - 1
+
+ # Calculate mid points for x and y bound coords - returns array
+ x_mids = (bounds[:, 0] + bounds[:, 2]) / 2.0
+ y_mids = (bounds[:, 1] + bounds[:, 3]) / 2.0
+
+ # Calculate x and y range of total bound coords - returns array
+ if total_bounds is None:
+ total_bounds = (
+ np.nanmin(x_mids),
+ np.nanmin(y_mids),
+ np.nanmax(x_mids),
+ np.nanmax(y_mids),
+ )
+
+ xmin, ymin, xmax, ymax = total_bounds
+
+ # Transform continuous value to discrete integer for each dimension
+ x_int = _continuous_to_discrete(x_mids, (xmin, xmax), side_length)
+ y_int = _continuous_to_discrete(y_mids, (ymin, ymax), side_length)
+
+ return x_int, y_int
def _continuous_to_discrete(vals, val_range, n):
@@ -67,7 +102,87 @@ def _continuous_to_discrete(vals, val_range, n):
One-dimensional array of discrete ints
"""
- pass
+ width = val_range[1] - val_range[0]
+ if width == 0:
+ return np.zeros_like(vals, dtype=np.uint32)
+ res = (vals - val_range[0]) * (n / width)
+
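+    # Snap midpoints that fall outside the given range onto the edges of the grid.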
+ np.clip(res, 0, n, out=res)
+ return res.astype(np.uint32)
+
+
+# Fast Hilbert curve algorithm by http://threadlocalmutex.com/
+# From C++ https://github.com/rawrunprotected/hilbert_curves
+# (public domain)
MAX_LEVEL = 16
+
+
+def _interleave(x):
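+    # Spread the lower 16 bits of x into the even bit positions of a 32-bit word
+    # (one half of a Morton/Z-order interleave).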
+ x = (x | (x << 8)) & 0x00FF00FF
+ x = (x | (x << 4)) & 0x0F0F0F0F
+ x = (x | (x << 2)) & 0x33333333
+ x = (x | (x << 1)) & 0x55555555
+ return x
+
+
+def _encode(level, x, y):
+ x = np.asarray(x, dtype="uint32")
+ y = np.asarray(y, dtype="uint32")
+
+ if level > MAX_LEVEL:
+ raise ValueError("Level out of range")
+
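+    # Shift the level-bit coordinates so they occupy the top of the 16-bit range
+    # that the branch-free transform below operates on.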
+ x = x << (16 - level)
+ y = y << (16 - level)
+
+ # Initial prefix scan round, prime with x and y
+ a = x ^ y
+ b = 0xFFFF ^ a
+ c = 0xFFFF ^ (x | y)
+ d = x & (y ^ 0xFFFF)
+
+ A = a | (b >> 1)
+ B = (a >> 1) ^ a
+ C = ((c >> 1) ^ (b & (d >> 1))) ^ c
+ D = ((a & (c >> 1)) ^ (d >> 1)) ^ d
+
+ a = A.copy()
+ b = B.copy()
+ c = C.copy()
+ d = D.copy()
+
+ A = (a & (a >> 2)) ^ (b & (b >> 2))
+ B = (a & (b >> 2)) ^ (b & ((a ^ b) >> 2))
+ C ^= (a & (c >> 2)) ^ (b & (d >> 2))
+ D ^= (b & (c >> 2)) ^ ((a ^ b) & (d >> 2))
+
+ a = A.copy()
+ b = B.copy()
+ c = C.copy()
+ d = D.copy()
+
+ A = (a & (a >> 4)) ^ (b & (b >> 4))
+ B = (a & (b >> 4)) ^ (b & ((a ^ b) >> 4))
+ C ^= (a & (c >> 4)) ^ (b & (d >> 4))
+ D ^= (b & (c >> 4)) ^ ((a ^ b) & (d >> 4))
+
+ # Final round and projection
+ a = A.copy()
+ b = B.copy()
+ c = C.copy()
+ d = D.copy()
+
+ C ^= (a & (c >> 8)) ^ (b & (d >> 8))
+ D ^= (b & (c >> 8)) ^ ((a ^ b) & (d >> 8))
+
+ # Undo transformation prefix scan
+ a = C ^ (C >> 1)
+ b = D ^ (D >> 1)
+
+ # Recover index bits
+ i0 = x ^ y
+ i1 = b | (0xFFFF ^ (i0 | a))
+
+ return ((_interleave(i1) << 1) | _interleave(i0)) >> (32 - 2 * level)
diff --git a/geopandas/tools/overlay.py b/geopandas/tools/overlay.py
index efb6afbd..06e60f7c 100644
--- a/geopandas/tools/overlay.py
+++ b/geopandas/tools/overlay.py
@@ -1,7 +1,9 @@
import warnings
from functools import reduce
+
import numpy as np
import pandas as pd
+
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import PANDAS_GE_30
from geopandas.array import _check_crs, _crs_mismatch_warn
@@ -12,39 +14,140 @@ def _ensure_geometry_column(df):
Helper function to ensure the geometry column is called 'geometry'.
If another column with that name exists, it will be dropped.
"""
- pass
+ if not df._geometry_column_name == "geometry":
+ if PANDAS_GE_30:
+ if "geometry" in df.columns:
+ df = df.drop("geometry", axis=1)
+ df = df.rename_geometry("geometry")
+ else:
+ if "geometry" in df.columns:
+ df.drop("geometry", axis=1, inplace=True)
+ df.rename_geometry("geometry", inplace=True)
+ return df
def _overlay_intersection(df1, df2):
"""
Overlay Intersection operation used in overlay function
"""
- pass
+ # Spatial Index to create intersections
+ idx1, idx2 = df2.sindex.query(df1.geometry, predicate="intersects", sort=True)
+ # Create pairs of geometries in both dataframes to be intersected
+ if idx1.size > 0 and idx2.size > 0:
+ left = df1.geometry.take(idx1)
+ left.reset_index(drop=True, inplace=True)
+ right = df2.geometry.take(idx2)
+ right.reset_index(drop=True, inplace=True)
+ intersections = left.intersection(right)
+ poly_ix = intersections.geom_type.isin(["Polygon", "MultiPolygon"])
+ intersections.loc[poly_ix] = intersections[poly_ix].make_valid()
+
+ # only keep actual intersecting geometries
+ pairs_intersect = pd.DataFrame({"__idx1": idx1, "__idx2": idx2})
+ geom_intersect = intersections
+
+ # merge data for intersecting geometries
+ df1 = df1.reset_index(drop=True)
+ df2 = df2.reset_index(drop=True)
+ dfinter = pairs_intersect.merge(
+ df1.drop(df1._geometry_column_name, axis=1),
+ left_on="__idx1",
+ right_index=True,
+ )
+ dfinter = dfinter.merge(
+ df2.drop(df2._geometry_column_name, axis=1),
+ left_on="__idx2",
+ right_index=True,
+ suffixes=("_1", "_2"),
+ )
+
+ return GeoDataFrame(dfinter, geometry=geom_intersect, crs=df1.crs)
+ else:
+ result = df1.iloc[:0].merge(
+ df2.iloc[:0].drop(df2.geometry.name, axis=1),
+ left_index=True,
+ right_index=True,
+ suffixes=("_1", "_2"),
+ )
+ result["__idx1"] = np.nan
+ result["__idx2"] = np.nan
+ return result[
+ result.columns.drop(df1.geometry.name).tolist() + [df1.geometry.name]
+ ]
def _overlay_difference(df1, df2):
"""
Overlay Difference operation used in overlay function
"""
- pass
+ # spatial index query to find intersections
+ idx1, idx2 = df2.sindex.query(df1.geometry, predicate="intersects", sort=True)
+ idx1_unique, idx1_unique_indices = np.unique(idx1, return_index=True)
+ idx2_split = np.split(idx2, idx1_unique_indices[1:])
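+    # Group the df2 indices by querying df1 row, so every row of df1 gets the
+    # array of df2 geometries it intersects (an empty list when there are none).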
+ sidx = [
+ idx2_split.pop(0) if idx in idx1_unique else []
+ for idx in range(df1.geometry.size)
+ ]
+ # Create differences
+ new_g = []
+ for geom, neighbours in zip(df1.geometry, sidx):
+ new = reduce(
+ lambda x, y: x.difference(y), [geom] + list(df2.geometry.iloc[neighbours])
+ )
+ new_g.append(new)
+ differences = GeoSeries(new_g, index=df1.index, crs=df1.crs)
+ poly_ix = differences.geom_type.isin(["Polygon", "MultiPolygon"])
+ differences.loc[poly_ix] = differences[poly_ix].make_valid()
+ geom_diff = differences[~differences.is_empty].copy()
+ dfdiff = df1[~differences.is_empty].copy()
+ dfdiff[dfdiff._geometry_column_name] = geom_diff
+ return dfdiff
def _overlay_symmetric_diff(df1, df2):
"""
Overlay Symmetric Difference operation used in overlay function
"""
- pass
+ dfdiff1 = _overlay_difference(df1, df2)
+ dfdiff2 = _overlay_difference(df2, df1)
+ dfdiff1["__idx1"] = range(len(dfdiff1))
+ dfdiff2["__idx2"] = range(len(dfdiff2))
+ dfdiff1["__idx2"] = np.nan
+ dfdiff2["__idx1"] = np.nan
+ # ensure geometry name (otherwise merge goes wrong)
+ dfdiff1 = _ensure_geometry_column(dfdiff1)
+ dfdiff2 = _ensure_geometry_column(dfdiff2)
+ # combine both 'difference' dataframes
+ dfsym = dfdiff1.merge(
+ dfdiff2, on=["__idx1", "__idx2"], how="outer", suffixes=("_1", "_2")
+ )
+ geometry = dfsym.geometry_1.copy()
+ geometry.name = "geometry"
+ # https://github.com/pandas-dev/pandas/issues/26468 use loc for now
+ geometry.loc[dfsym.geometry_1.isnull()] = dfsym.loc[
+ dfsym.geometry_1.isnull(), "geometry_2"
+ ]
+ dfsym.drop(["geometry_1", "geometry_2"], axis=1, inplace=True)
+ dfsym.reset_index(drop=True, inplace=True)
+ dfsym = GeoDataFrame(dfsym, geometry=geometry, crs=df1.crs)
+ return dfsym
def _overlay_union(df1, df2):
"""
Overlay Union operation used in overlay function
"""
- pass
+ dfinter = _overlay_intersection(df1, df2)
+ dfsym = _overlay_symmetric_diff(df1, df2)
+ dfunion = pd.concat([dfinter, dfsym], ignore_index=True, sort=False)
+ # keep geometry column last
+ columns = list(dfunion.columns)
+ columns.remove("geometry")
+ columns.append("geometry")
+ return dfunion.reindex(columns=columns)
-def overlay(df1, df2, how='intersection', keep_geom_type=None, make_valid=True
- ):
+def overlay(df1, df2, how="intersection", keep_geom_type=None, make_valid=True):
"""Perform spatial overlay between two GeoDataFrames.
Currently only supports data GeoDataFrames with uniform geometry types,
@@ -132,4 +235,165 @@ def overlay(df1, df2, how='intersection', keep_geom_type=None, make_valid=True
Every operation in GeoPandas is planar, i.e. the potential third
dimension is not taken into account.
"""
- pass
+ # Allowed operations
+ allowed_hows = [
+ "intersection",
+ "union",
+ "identity",
+ "symmetric_difference",
+ "difference", # aka erase
+ ]
+ # Error Messages
+ if how not in allowed_hows:
+ raise ValueError(
+ "`how` was '{0}' but is expected to be in {1}".format(how, allowed_hows)
+ )
+
+ if isinstance(df1, GeoSeries) or isinstance(df2, GeoSeries):
+ raise NotImplementedError(
+ "overlay currently only implemented for GeoDataFrames"
+ )
+
+ if not _check_crs(df1, df2):
+ _crs_mismatch_warn(df1, df2, stacklevel=3)
+
+ if keep_geom_type is None:
+ keep_geom_type = True
+ keep_geom_type_warning = True
+ else:
+ keep_geom_type_warning = False
+
+ polys = ["Polygon", "MultiPolygon"]
+ lines = ["LineString", "MultiLineString", "LinearRing"]
+ points = ["Point", "MultiPoint"]
+ for i, df in enumerate([df1, df2]):
+ poly_check = df.geom_type.isin(polys).any()
+ lines_check = df.geom_type.isin(lines).any()
+ points_check = df.geom_type.isin(points).any()
+ if sum([poly_check, lines_check, points_check]) > 1:
+ raise NotImplementedError(
+ "df{} contains mixed geometry types.".format(i + 1)
+ )
+
+ if how == "intersection":
+ box_gdf1 = df1.total_bounds
+ box_gdf2 = df2.total_bounds
+
+ if not (
+ ((box_gdf1[0] <= box_gdf2[2]) and (box_gdf2[0] <= box_gdf1[2]))
+ and ((box_gdf1[1] <= box_gdf2[3]) and (box_gdf2[1] <= box_gdf1[3]))
+ ):
+ result = df1.iloc[:0].merge(
+ df2.iloc[:0].drop(df2.geometry.name, axis=1),
+ left_index=True,
+ right_index=True,
+ suffixes=("_1", "_2"),
+ )
+ return result[
+ result.columns.drop(df1.geometry.name).tolist() + [df1.geometry.name]
+ ]
+
+ # Computations
+ def _make_valid(df):
+ df = df.copy()
+ if df.geom_type.isin(polys).all():
+ mask = ~df.geometry.is_valid
+ col = df._geometry_column_name
+ if make_valid:
+ df.loc[mask, col] = df.loc[mask, col].make_valid()
+ elif mask.any():
+ raise ValueError(
+ "You have passed make_valid=False along with "
+ f"{mask.sum()} invalid input geometries. "
+ "Use make_valid=True or make sure that all geometries "
+ "are valid before using overlay."
+ )
+ return df
+
+ df1 = _make_valid(df1)
+ df2 = _make_valid(df2)
+
+ with warnings.catch_warnings(): # CRS checked above, suppress array-level warning
+ warnings.filterwarnings("ignore", message="CRS mismatch between the CRS")
+ if how == "difference":
+ result = _overlay_difference(df1, df2)
+ elif how == "intersection":
+ result = _overlay_intersection(df1, df2)
+ elif how == "symmetric_difference":
+ result = _overlay_symmetric_diff(df1, df2)
+ elif how == "union":
+ result = _overlay_union(df1, df2)
+ elif how == "identity":
+ dfunion = _overlay_union(df1, df2)
+ result = dfunion[dfunion["__idx1"].notnull()].copy()
+
+ if how in ["intersection", "symmetric_difference", "union", "identity"]:
+ result.drop(["__idx1", "__idx2"], axis=1, inplace=True)
+
+ if keep_geom_type:
+ geom_type = df1.geom_type.iloc[0]
+
+ # First we filter the geometry types inside GeometryCollections objects
+ # (e.g. GeometryCollection([polygon, point]) -> polygon)
+ # we do this separately on only the relevant rows, as this is an expensive
+ # operation (an expensive no-op for geometry types other than collections)
+ is_collection = result.geom_type == "GeometryCollection"
+ if is_collection.any():
+ geom_col = result._geometry_column_name
+ collections = result[[geom_col]][is_collection]
+
+ exploded = collections.reset_index(drop=True).explode(index_parts=True)
+ exploded = exploded.reset_index(level=0)
+
+ orig_num_geoms_exploded = exploded.shape[0]
+ if geom_type in polys:
+ exploded.loc[~exploded.geom_type.isin(polys), geom_col] = None
+ elif geom_type in lines:
+ exploded.loc[~exploded.geom_type.isin(lines), geom_col] = None
+ elif geom_type in points:
+ exploded.loc[~exploded.geom_type.isin(points), geom_col] = None
+ else:
+ raise TypeError(
+ "`keep_geom_type` does not support {}.".format(geom_type)
+ )
+ num_dropped_collection = (
+ orig_num_geoms_exploded - exploded.geometry.isna().sum()
+ )
+
+ # level_0 created with above reset_index operation
+ # and represents the original geometry collections
+ # TODO avoiding dissolve to call union_all in this case could further
+ # improve performance (we only need to collect geometries in their
+ # respective Multi version)
+ dissolved = exploded.dissolve(by="level_0")
+ result.loc[is_collection, geom_col] = dissolved[geom_col].values
+ else:
+ num_dropped_collection = 0
+
+ # Now we filter all geometries (in theory we don't need to do this
+ # again for the rows handled above for GeometryCollections, but filtering
+        # them out is probably more expensive than simply including them, as this
+        # typically concerns only a few rows)
+ orig_num_geoms = result.shape[0]
+ if geom_type in polys:
+ result = result.loc[result.geom_type.isin(polys)]
+ elif geom_type in lines:
+ result = result.loc[result.geom_type.isin(lines)]
+ elif geom_type in points:
+ result = result.loc[result.geom_type.isin(points)]
+ else:
+ raise TypeError("`keep_geom_type` does not support {}.".format(geom_type))
+ num_dropped = orig_num_geoms - result.shape[0]
+
+ if (num_dropped > 0 or num_dropped_collection > 0) and keep_geom_type_warning:
+ warnings.warn(
+ "`keep_geom_type=True` in overlay resulted in {} dropped "
+ "geometries of different geometry types than df1 has. "
+ "Set `keep_geom_type=False` to retain all "
+ "geometries".format(num_dropped + num_dropped_collection),
+ UserWarning,
+ stacklevel=2,
+ )
+
+ result.reset_index(drop=True, inplace=True)
+ return result
diff --git a/geopandas/tools/sjoin.py b/geopandas/tools/sjoin.py
index 04fc98c9..06d7ef74 100644
--- a/geopandas/tools/sjoin.py
+++ b/geopandas/tools/sjoin.py
@@ -1,15 +1,26 @@
import warnings
from functools import partial
from typing import Optional
+
import numpy as np
import pandas as pd
+
from geopandas import GeoDataFrame
from geopandas._compat import PANDAS_GE_30
from geopandas.array import _check_crs, _crs_mismatch_warn
-def sjoin(left_df, right_df, how='inner', predicate='intersects', lsuffix=
- 'left', rsuffix='right', distance=None, on_attribute=None, **kwargs):
+def sjoin(
+ left_df,
+ right_df,
+ how="inner",
+ predicate="intersects",
+ lsuffix="left",
+ rsuffix="right",
+ distance=None,
+ on_attribute=None,
+ **kwargs,
+):
"""Spatial join of two GeoDataFrames.
See the User Guide page :doc:`../../user_guide/mergingdata` for details.
@@ -94,7 +105,39 @@ def sjoin(left_df, right_df, how='inner', predicate='intersects', lsuffix=
Every operation in GeoPandas is planar, i.e. the potential third
dimension is not taken into account.
"""
- pass
+ if kwargs:
+ first = next(iter(kwargs.keys()))
+ raise TypeError(f"sjoin() got an unexpected keyword argument '{first}'")
+
+ on_attribute = _maybe_make_list(on_attribute)
+
+    _basic_checks(left_df, right_df, how, lsuffix, rsuffix, on_attribute=on_attribute)
+
+ indices = _geom_predicate_query(
+ left_df, right_df, predicate, distance, on_attribute=on_attribute
+ )
+
+ joined, _ = _frame_join(
+ left_df,
+ right_df,
+ indices,
+ None,
+ how,
+ lsuffix,
+ rsuffix,
+ predicate,
+ on_attribute=on_attribute,
+ )
+
+ return joined
+
+
+def _maybe_make_list(obj):
+ if isinstance(obj, tuple):
+ return list(obj)
+ if obj is not None and not isinstance(obj, list):
+ return [obj]
+ return obj
def _basic_checks(left_df, right_df, how, lsuffix, rsuffix, on_attribute=None):
@@ -117,11 +160,47 @@ def _basic_checks(left_df, right_df, how, lsuffix, rsuffix, on_attribute=None):
on_attribute : list, default None
list of column names to merge on along with geometry
"""
- pass
-
-
-def _geom_predicate_query(left_df, right_df, predicate, distance,
- on_attribute=None):
+ if not isinstance(left_df, GeoDataFrame):
+ raise ValueError(
+ "'left_df' should be GeoDataFrame, got {}".format(type(left_df))
+ )
+
+ if not isinstance(right_df, GeoDataFrame):
+ raise ValueError(
+ "'right_df' should be GeoDataFrame, got {}".format(type(right_df))
+ )
+
+ allowed_hows = ["left", "right", "inner"]
+ if how not in allowed_hows:
+ raise ValueError(
+ '`how` was "{}" but is expected to be in {}'.format(how, allowed_hows)
+ )
+
+ if not _check_crs(left_df, right_df):
+ _crs_mismatch_warn(left_df, right_df, stacklevel=4)
+
+ if on_attribute:
+ for attr in on_attribute:
+ if (attr not in left_df) and (attr not in right_df):
+ raise ValueError(
+ f"Expected column {attr} is missing from both of the dataframes."
+ )
+ if attr not in left_df:
+ raise ValueError(
+ f"Expected column {attr} is missing from the left dataframe."
+ )
+ if attr not in right_df:
+ raise ValueError(
+ f"Expected column {attr} is missing from the right dataframe."
+ )
+ if attr in (left_df.geometry.name, right_df.geometry.name):
+ raise ValueError(
+ "Active geometry column cannot be used as an input "
+ "for on_attribute parameter."
+ )
+
+
+def _geom_predicate_query(left_df, right_df, predicate, distance, on_attribute=None):
"""Compute geometric comparisons and get matching indices.
Parameters
@@ -140,7 +219,45 @@ def _geom_predicate_query(left_df, right_df, predicate, distance,
DataFrame with matching indices in
columns named `_key_left` and `_key_right`.
"""
- pass
+
+ original_predicate = predicate
+
+ if predicate == "within":
+ # within is implemented as the inverse of contains
+ # contains is a faster predicate
+ # see discussion at https://github.com/geopandas/geopandas/pull/1421
+ predicate = "contains"
+ sindex = left_df.sindex
+ input_geoms = right_df.geometry
+ else:
+ # all other predicates are symmetric
+ # keep them the same
+ sindex = right_df.sindex
+ input_geoms = left_df.geometry
+
+ if sindex:
+ l_idx, r_idx = sindex.query(
+ input_geoms, predicate=predicate, sort=False, distance=distance
+ )
+ else:
+ # when sindex is empty / has no valid geometries
+ l_idx, r_idx = np.array([], dtype=np.intp), np.array([], dtype=np.intp)
+
+ if original_predicate == "within":
+ # within is implemented as the inverse of contains
+ # flip back the results
+ r_idx, l_idx = l_idx, r_idx
+ indexer = np.lexsort((r_idx, l_idx))
+ l_idx = l_idx[indexer]
+ r_idx = r_idx[indexer]
+
+ if on_attribute:
+ for attr in on_attribute:
+ (l_idx, r_idx), _ = _filter_shared_attribute(
+ left_df, right_df, l_idx, r_idx, attr
+ )
+
+ return l_idx, r_idx
def _reset_index_with_suffix(df, suffix, other):
@@ -148,18 +265,91 @@ def _reset_index_with_suffix(df, suffix, other):
Equivalent of df.reset_index(), but with adding 'suffix' to auto-generated
column names.
"""
- pass
-
-
-def _process_column_names_with_suffix(left: pd.Index, right: pd.Index,
- suffixes, left_df, right_df):
+ index_original = df.index.names
+ if PANDAS_GE_30:
+ df_reset = df.reset_index()
+ else:
+ # we already made a copy of the dataframe in _frame_join before getting here
+ df_reset = df
+ df_reset.reset_index(inplace=True)
+ column_names = df_reset.columns.to_numpy(copy=True)
+ for i, label in enumerate(index_original):
+ # if the original label was None, add suffix to auto-generated name
+ if label is None:
+ new_label = column_names[i]
+ if "level" in new_label:
+ # reset_index of MultiIndex gives "level_i" names, preserve the "i"
+ lev = new_label.split("_")[1]
+ new_label = f"index_{suffix}{lev}"
+ else:
+ new_label = f"index_{suffix}"
+ # check new label will not be in other dataframe
+ if new_label in df.columns or new_label in other.columns:
+ raise ValueError(
+ "'{0}' cannot be a column name in the frames being"
+ " joined".format(new_label)
+ )
+ column_names[i] = new_label
+ return df_reset, pd.Index(column_names)
+
+
+def _process_column_names_with_suffix(
+ left: pd.Index, right: pd.Index, suffixes, left_df, right_df
+):
"""
Add suffixes to overlapping labels (ignoring the geometry column).
This is based on pandas' merge logic at https://github.com/pandas-dev/pandas/blob/
a0779adb183345a8eb4be58b3ad00c223da58768/pandas/core/reshape/merge.py#L2300-L2370
"""
- pass
+ to_rename = left.intersection(right)
+ if len(to_rename) == 0:
+ return left, right
+
+ lsuffix, rsuffix = suffixes
+
+ if not lsuffix and not rsuffix:
+ raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
+
+ def renamer(x, suffix, geometry):
+ if x in to_rename and x != geometry and suffix is not None:
+ return f"{x}_{suffix}"
+ return x
+
+ lrenamer = partial(
+ renamer,
+ suffix=lsuffix,
+ geometry=getattr(left_df, "_geometry_column_name", None),
+ )
+ rrenamer = partial(
+ renamer,
+ suffix=rsuffix,
+ geometry=getattr(right_df, "_geometry_column_name", None),
+ )
+
+ # TODO retain index name?
+ left_renamed = pd.Index([lrenamer(lab) for lab in left])
+ right_renamed = pd.Index([rrenamer(lab) for lab in right])
+
+ dups = []
+ if not left_renamed.is_unique:
+        # Only warn when duplicates are caused by the suffixes; columns that were
+        # already duplicated in the input should not warn
+ dups = left_renamed[(left_renamed.duplicated()) & (~left.duplicated())].tolist()
+ if not right_renamed.is_unique:
+ dups.extend(
+ right_renamed[(right_renamed.duplicated()) & (~right.duplicated())].tolist()
+ )
+ # TODO turn this into an error (pandas has done so as well)
+ if dups:
+ warnings.warn(
+ f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
+ f"result is deprecated and will raise a MergeError in a future version.",
+ FutureWarning,
+ stacklevel=4,
+ )
+
+ return left_renamed, right_renamed
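A pandas-only toy sketch of the suffixing rule implemented above: labels that appear in both frames get a suffix, while the active geometry column is left untouched (the column names here are invented for illustration):

```python
import pandas as pd

left = pd.Index(["id", "value", "geometry"])
right = pd.Index(["value", "area", "geometry"])
overlap = left.intersection(right)  # overlapping labels: 'value' and 'geometry'

def rename(label, suffix, geometry="geometry"):
    # only overlapping, non-geometry labels get a suffix
    if label in overlap and label != geometry:
        return f"{label}_{suffix}"
    return label

print([rename(lab, "left") for lab in left])    # ['id', 'value_left', 'geometry']
print([rename(lab, "right") for lab in right])  # ['value_right', 'area', 'geometry']
```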
def _restore_index(joined, index_names, index_names_original):
@@ -167,7 +357,18 @@ def _restore_index(joined, index_names, index_names_original):
Set back the original index columns, and restore their name as `None`
if they didn't have a name originally.
"""
- pass
+ if PANDAS_GE_30:
+ joined = joined.set_index(list(index_names))
+ else:
+ joined.set_index(list(index_names), inplace=True)
+
+ # restore the fact that the index didn't have a name
+ joined_index_names = list(joined.index.names)
+ for i, label in enumerate(index_names_original):
+ if label is None:
+ joined_index_names[i] = None
+ joined.index.names = joined_index_names
+ return joined
def _adjust_indexers(indices, distances, original_length, how, predicate):
@@ -176,11 +377,52 @@ def _adjust_indexers(indices, distances, original_length, how, predicate):
For a left or right join, we need to adjust them to include the rows
that would not be present in an inner join.
"""
- pass
-
-
-def _frame_join(left_df, right_df, indices, distances, how, lsuffix,
- rsuffix, predicate, on_attribute=None):
+ # the indices represent an inner join, no adjustment needed
+ if how == "inner":
+ return indices, distances
+
+ l_idx, r_idx = indices
+
+ if how == "right":
+ # re-sort so it is sorted by the right indexer
+ indexer = np.lexsort((l_idx, r_idx))
+ l_idx, r_idx = l_idx[indexer], r_idx[indexer]
+ if distances is not None:
+ distances = distances[indexer]
+
+ # switch order
+ r_idx, l_idx = l_idx, r_idx
+
+ # determine which indices are missing and where they would need to be inserted
+ idx = np.arange(original_length)
+ l_idx_missing = idx[~np.isin(idx, l_idx)]
+ insert_idx = np.searchsorted(l_idx, l_idx_missing)
+ # for the left indexer, insert those missing indices
+ l_idx = np.insert(l_idx, insert_idx, l_idx_missing)
+ # for the right indexer, insert -1 -> to get missing values in pandas' reindexing
+ r_idx = np.insert(r_idx, insert_idx, -1)
+    # for the distances, insert NaN for those missing rows manually
+ if distances is not None:
+ distances = np.insert(distances, insert_idx, np.nan)
+
+ if how == "right":
+ # switch back
+ l_idx, r_idx = r_idx, l_idx
+
+ return (l_idx, r_idx), distances
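The left-join adjustment above can be illustrated with plain numpy (toy indexer arrays, no geopandas involved): missing left rows are spliced back in and paired with -1 so that the later reindexing step yields NaN for them.

```python
import numpy as np

# toy inner-join result: left rows 0 and 2 matched, left row 1 did not
l_idx = np.array([0, 2])
r_idx = np.array([5, 7])
original_length = 3  # number of rows in the left frame

idx = np.arange(original_length)
missing = idx[~np.isin(idx, l_idx)]          # -> [1]
insert_at = np.searchsorted(l_idx, missing)  # -> [1]

l_full = np.insert(l_idx, insert_at, missing)  # -> [0, 1, 2]
r_full = np.insert(r_idx, insert_at, -1)       # -> [5, -1, 7]
print(l_full.tolist(), r_full.tolist())
```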
+
+
+def _frame_join(
+ left_df,
+ right_df,
+ indices,
+ distances,
+ how,
+ lsuffix,
+ rsuffix,
+ predicate,
+ on_attribute=None,
+):
"""Join the GeoDataFrames at the DataFrame level.
Parameters
@@ -208,7 +450,117 @@ def _frame_join(left_df, right_df, indices, distances, how, lsuffix,
GeoDataFrame
Joined GeoDataFrame.
"""
- pass
+ if on_attribute: # avoid renaming or duplicating shared column
+ right_df = right_df.drop(on_attribute, axis=1)
+
+ if how in ("inner", "left"):
+ right_df = right_df.drop(right_df.geometry.name, axis=1)
+ else: # how == 'right':
+ left_df = left_df.drop(left_df.geometry.name, axis=1)
+
+ left_df = left_df.copy(deep=False)
+ left_nlevels = left_df.index.nlevels
+ left_index_original = left_df.index.names
+ left_df, left_column_names = _reset_index_with_suffix(left_df, lsuffix, right_df)
+
+ right_df = right_df.copy(deep=False)
+ right_nlevels = right_df.index.nlevels
+ right_index_original = right_df.index.names
+ right_df, right_column_names = _reset_index_with_suffix(right_df, rsuffix, left_df)
+
+ # if conflicting names in left and right, add suffix
+ left_column_names, right_column_names = _process_column_names_with_suffix(
+ left_column_names,
+ right_column_names,
+ (lsuffix, rsuffix),
+ left_df,
+ right_df,
+ )
+ left_df.columns = left_column_names
+ right_df.columns = right_column_names
+ left_index = left_df.columns[:left_nlevels]
+ right_index = right_df.columns[:right_nlevels]
+
+ # perform join on the dataframes
+ original_length = len(right_df) if how == "right" else len(left_df)
+ (l_idx, r_idx), distances = _adjust_indexers(
+ indices, distances, original_length, how, predicate
+ )
+ # the `take` method doesn't allow introducing NaNs with -1 indices
+ # left = left_df.take(l_idx)
+ # therefore we are using the private _reindex_with_indexers as workaround
+ new_index = pd.RangeIndex(len(l_idx))
+ left = left_df._reindex_with_indexers({0: (new_index, l_idx)})
+ right = right_df._reindex_with_indexers({0: (new_index, r_idx)})
+ if PANDAS_GE_30:
+ kwargs = {}
+ else:
+ kwargs = dict(copy=False)
+ joined = pd.concat([left, right], axis=1, **kwargs)
+
+ if how in ("inner", "left"):
+ joined = _restore_index(joined, left_index, left_index_original)
+ else: # how == 'right':
+ joined = joined.set_geometry(right_df.geometry.name)
+ joined = _restore_index(joined, right_index, right_index_original)
+
+ return joined, distances
+
+
+def _nearest_query(
+ left_df: GeoDataFrame,
+ right_df: GeoDataFrame,
+ max_distance: float,
+ how: str,
+ return_distance: bool,
+ exclusive: bool,
+ on_attribute: Optional[list] = None,
+):
+    # build the spatial index on the side opposite to the join direction
+ use_left_as_sindex = how == "right"
+ if use_left_as_sindex:
+ sindex = left_df.sindex
+ query = right_df.geometry
+ else:
+ sindex = right_df.sindex
+ query = left_df.geometry
+ if sindex:
+ res = sindex.nearest(
+ query,
+ return_all=True,
+ max_distance=max_distance,
+ return_distance=return_distance,
+ exclusive=exclusive,
+ )
+ if return_distance:
+ (input_idx, tree_idx), distances = res
+ else:
+ (input_idx, tree_idx) = res
+ distances = None
+ if use_left_as_sindex:
+ l_idx, r_idx = tree_idx, input_idx
+ sort_order = np.argsort(l_idx, kind="stable")
+ l_idx, r_idx = l_idx[sort_order], r_idx[sort_order]
+ if distances is not None:
+ distances = distances[sort_order]
+ else:
+ l_idx, r_idx = input_idx, tree_idx
+ else:
+ # when sindex is empty / has no valid geometries
+ l_idx, r_idx = np.array([], dtype=np.intp), np.array([], dtype=np.intp)
+ if return_distance:
+ distances = np.array([], dtype=np.float64)
+ else:
+ distances = None
+
+ if on_attribute:
+ for attr in on_attribute:
+ (l_idx, r_idx), shared_attribute_rows = _filter_shared_attribute(
+ left_df, right_df, l_idx, r_idx, attr
+ )
+ distances = distances[shared_attribute_rows]
+
+ return (l_idx, r_idx), distances
def _filter_shared_attribute(left_df, right_df, l_idx, r_idx, attribute):
@@ -217,13 +569,25 @@ def _filter_shared_attribute(left_df, right_df, l_idx, r_idx, attribute):
in the attribute column. Also returns a Boolean `shared_attribute_rows` for rows
with the same entry.
"""
- pass
-
-
-def sjoin_nearest(left_df: GeoDataFrame, right_df: GeoDataFrame, how: str=
- 'inner', max_distance: Optional[float]=None, lsuffix: str='left',
- rsuffix: str='right', distance_col: Optional[str]=None, exclusive: bool
- =False) ->GeoDataFrame:
+ shared_attribute_rows = (
+ left_df[attribute].iloc[l_idx].values == right_df[attribute].iloc[r_idx].values
+ )
+
+ l_idx = l_idx[shared_attribute_rows]
+ r_idx = r_idx[shared_attribute_rows]
+ return (l_idx, r_idx), shared_attribute_rows
+
+
+def sjoin_nearest(
+ left_df: GeoDataFrame,
+ right_df: GeoDataFrame,
+ how: str = "inner",
+ max_distance: Optional[float] = None,
+ lsuffix: str = "left",
+ rsuffix: str = "right",
+ distance_col: Optional[str] = None,
+ exclusive: bool = False,
+) -> GeoDataFrame:
"""Spatial join of two GeoDataFrames based on the distance between their geometries.
Results will include multiple output records for a single input record
@@ -301,8 +665,10 @@ def sjoin_nearest(left_df: GeoDataFrame, right_df: GeoDataFrame, how: str=
To include the distances:
- >>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago, distance_col="distances")
- >>> groceries_w_communities[["Chain", "community", "distances"]].head(2)
+ >>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago, \
+distance_col="distances")
+ >>> groceries_w_communities[["Chain", "community", \
+"distances"]].head(2)
Chain community distances
0 VIET HOA PLAZA UPTOWN 0.0
1 COUNTY FAIR FOODS MORGAN PARK 0.0
@@ -311,8 +677,10 @@ def sjoin_nearest(left_df: GeoDataFrame, right_df: GeoDataFrame, how: str=
results are equidistant (in this case zero because they intersect).
In fact, we get 4 results in total:
- >>> chicago_w_groceries = geopandas.sjoin_nearest(groceries, chicago, distance_col="distances", how="right")
- >>> uptown_results = chicago_w_groceries[chicago_w_groceries["community"] == "UPTOWN"]
+ >>> chicago_w_groceries = geopandas.sjoin_nearest(groceries, chicago, \
+distance_col="distances", how="right")
+ >>> uptown_results = \
+chicago_w_groceries[chicago_w_groceries["community"] == "UPTOWN"]
>>> uptown_results[["Chain", "community"]]
Chain community
30 VIET HOA PLAZA UPTOWN
@@ -333,4 +701,34 @@ def sjoin_nearest(left_df: GeoDataFrame, right_df: GeoDataFrame, how: str=
Every operation in GeoPandas is planar, i.e. the potential third
dimension is not taken into account.
"""
- pass
+
+ _basic_checks(left_df, right_df, how, lsuffix, rsuffix)
+
+ left_df.geometry.values.check_geographic_crs(stacklevel=1)
+ right_df.geometry.values.check_geographic_crs(stacklevel=1)
+
+ return_distance = distance_col is not None
+
+ indices, distances = _nearest_query(
+ left_df,
+ right_df,
+ max_distance,
+ how,
+ return_distance,
+ exclusive,
+ )
+ joined, distances = _frame_join(
+ left_df,
+ right_df,
+ indices,
+ distances,
+ how,
+ lsuffix,
+ rsuffix,
+ None,
+ )
+
+ if return_distance:
+ joined[distance_col] = distances
+
+ return joined
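For orientation, a small usage sketch of the public sjoin_nearest entry point implemented above (toy frames; the column names are invented for the example):

```python
import geopandas
from shapely.geometry import Point, Polygon

pts = geopandas.GeoDataFrame(
    {"name": ["a", "b"]},
    geometry=[Point(1, 1), Point(5, 5)],
    crs="EPSG:3857",
)
zones = geopandas.GeoDataFrame(
    {"zone": ["low", "high"]},
    geometry=[
        Polygon([(0, 0), (2, 0), (2, 2), (0, 2)]),
        Polygon([(4, 4), (6, 4), (6, 6), (4, 6)]),
    ],
    crs="EPSG:3857",
)

# each point is matched to its nearest zone; distances are attached as a column
joined = geopandas.sjoin_nearest(pts, zones, distance_col="dist")
print(joined[["name", "zone", "dist"]])  # both points fall inside a zone -> dist 0.0
```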
diff --git a/geopandas/tools/tests/test_clip.py b/geopandas/tools/tests/test_clip.py
index 6ccf6e29..dbdbfe6e 100644
--- a/geopandas/tools/tests/test_clip.py
+++ b/geopandas/tools/tests/test_clip.py
@@ -1,70 +1,99 @@
"""Tests for the clip module."""
+
import numpy as np
import pandas as pd
+
import shapely
-from shapely.geometry import GeometryCollection, LinearRing, LineString, MultiPoint, Point, Polygon, box
+from shapely.geometry import (
+ GeometryCollection,
+ LinearRing,
+ LineString,
+ MultiPoint,
+ Point,
+ Polygon,
+ box,
+)
+
import geopandas
from geopandas import GeoDataFrame, GeoSeries, clip
from geopandas._compat import HAS_PYPROJ
from geopandas.tools.clip import _mask_is_list_like_rectangle
+
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from pandas.testing import assert_index_equal
-mask_variants_single_rectangle = ['single_rectangle_gdf',
- 'single_rectangle_gdf_list_bounds', 'single_rectangle_gdf_tuple_bounds',
- 'single_rectangle_gdf_array_bounds']
-mask_variants_large_rectangle = ['larger_single_rectangle_gdf',
- 'larger_single_rectangle_gdf_bounds']
+
+mask_variants_single_rectangle = [
+ "single_rectangle_gdf",
+ "single_rectangle_gdf_list_bounds",
+ "single_rectangle_gdf_tuple_bounds",
+ "single_rectangle_gdf_array_bounds",
+]
+mask_variants_large_rectangle = [
+ "larger_single_rectangle_gdf",
+ "larger_single_rectangle_gdf_bounds",
+]
@pytest.fixture
def point_gdf():
"""Create a point GeoDataFrame."""
- pass
+ pts = np.array([[2, 2], [3, 4], [9, 8], [-12, -15]])
+ gdf = GeoDataFrame([Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857")
+ return gdf
@pytest.fixture
def point_gdf2():
"""Create a point GeoDataFrame."""
- pass
+ pts = np.array([[5, 5], [2, 2], [4, 4], [0, 0], [3, 3], [1, 1]])
+ gdf = GeoDataFrame([Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857")
+ return gdf
@pytest.fixture
def pointsoutside_nooverlap_gdf():
"""Create a point GeoDataFrame. Its points are all outside the single
rectangle, and its bounds are outside the single rectangle's."""
- pass
+ pts = np.array([[5, 15], [15, 15], [15, 20]])
+ gdf = GeoDataFrame([Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857")
+ return gdf
@pytest.fixture
def pointsoutside_overlap_gdf():
"""Create a point GeoDataFrame. Its points are all outside the single
rectangle, and its bounds are overlapping the single rectangle's."""
- pass
+ pts = np.array([[5, 15], [15, 15], [15, 5]])
+ gdf = GeoDataFrame([Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857")
+ return gdf
@pytest.fixture
def single_rectangle_gdf():
"""Create a single rectangle for clipping."""
- pass
+ poly_inters = Polygon([(0, 0), (0, 10), (10, 10), (10, 0), (0, 0)])
+ gdf = GeoDataFrame([1], geometry=[poly_inters], crs="EPSG:3857")
+ gdf["attr2"] = "site-boundary"
+ return gdf
@pytest.fixture
def single_rectangle_gdf_tuple_bounds(single_rectangle_gdf):
"""Bounds of the created single rectangle"""
- pass
+ return tuple(single_rectangle_gdf.total_bounds)
@pytest.fixture
def single_rectangle_gdf_list_bounds(single_rectangle_gdf):
"""Bounds of the created single rectangle"""
- pass
+ return list(single_rectangle_gdf.total_bounds)
@pytest.fixture
def single_rectangle_gdf_array_bounds(single_rectangle_gdf):
"""Bounds of the created single rectangle"""
- pass
+ return single_rectangle_gdf.total_bounds
@pytest.fixture
@@ -74,204 +103,382 @@ def larger_single_rectangle_gdf():
are returned when you clip polygons. This fixture is larger which
eliminates the slivers in the clip return.
"""
- pass
+ poly_inters = Polygon([(-5, -5), (-5, 15), (15, 15), (15, -5), (-5, -5)])
+ gdf = GeoDataFrame([1], geometry=[poly_inters], crs="EPSG:3857")
+ gdf["attr2"] = ["study area"]
+ return gdf
@pytest.fixture
def larger_single_rectangle_gdf_bounds(larger_single_rectangle_gdf):
"""Bounds of the created single rectangle"""
- pass
+ return tuple(larger_single_rectangle_gdf.total_bounds)
@pytest.fixture
def buffered_locations(point_gdf):
"""Buffer points to create a multi-polygon."""
- pass
+ buffered_locs = point_gdf
+ buffered_locs["geometry"] = buffered_locs.buffer(4)
+ buffered_locs["type"] = "plot"
+ return buffered_locs
@pytest.fixture
def donut_geometry(buffered_locations, single_rectangle_gdf):
"""Make a geometry with a hole in the middle (a donut)."""
- pass
+ donut = geopandas.overlay(
+ buffered_locations, single_rectangle_gdf, how="symmetric_difference"
+ )
+ return donut
@pytest.fixture
def two_line_gdf():
"""Create Line Objects For Testing"""
- pass
+ linea = LineString([(1, 1), (2, 2), (3, 2), (5, 3)])
+ lineb = LineString([(3, 4), (5, 7), (12, 2), (10, 5), (9, 7.5)])
+ gdf = GeoDataFrame([1, 2], geometry=[linea, lineb], crs="EPSG:3857")
+ return gdf
@pytest.fixture
def multi_poly_gdf(donut_geometry):
"""Create a multi-polygon GeoDataFrame."""
- pass
+ multi_poly = donut_geometry.union_all()
+ out_df = GeoDataFrame(geometry=GeoSeries(multi_poly), crs="EPSG:3857")
+ out_df["attr"] = ["pool"]
+ return out_df
@pytest.fixture
def multi_line(two_line_gdf):
"""Create a multi-line GeoDataFrame.
This GDF has one multiline and one regular line."""
- pass
+ # Create a single and multi line object
+ multiline_feat = two_line_gdf.union_all()
+ linec = LineString([(2, 1), (3, 1), (4, 1), (5, 2)])
+ out_df = GeoDataFrame(geometry=GeoSeries([multiline_feat, linec]), crs="EPSG:3857")
+ out_df["attr"] = ["road", "stream"]
+ return out_df
@pytest.fixture
def multi_point(point_gdf):
"""Create a multi-point GeoDataFrame."""
- pass
+ multi_point = point_gdf.union_all()
+ out_df = GeoDataFrame(
+ geometry=GeoSeries(
+ [multi_point, Point(2, 5), Point(-11, -14), Point(-10, -12)]
+ ),
+ crs="EPSG:3857",
+ )
+ out_df["attr"] = ["tree", "another tree", "shrub", "berries"]
+ return out_df
@pytest.fixture
def mixed_gdf():
"""Create a Mixed Polygon and LineString For Testing"""
- pass
+ point = Point(2, 3)
+ line = LineString([(1, 1), (2, 2), (3, 2), (5, 3), (12, 1)])
+ poly = Polygon([(3, 4), (5, 2), (12, 2), (10, 5), (9, 7.5)])
+ ring = LinearRing([(1, 1), (2, 2), (3, 2), (5, 3), (12, 1)])
+ gdf = GeoDataFrame(
+ [1, 2, 3, 4], geometry=[point, poly, line, ring], crs="EPSG:3857"
+ )
+ return gdf
@pytest.fixture
def geomcol_gdf():
"""Create a Mixed Polygon and LineString For Testing"""
- pass
+ point = Point(2, 3)
+ poly = Polygon([(3, 4), (5, 2), (12, 2), (10, 5), (9, 7.5)])
+ coll = GeometryCollection([point, poly])
+ gdf = GeoDataFrame([1], geometry=[coll], crs="EPSG:3857")
+ return gdf
@pytest.fixture
def sliver_line():
"""Create a line that will create a point when clipped."""
- pass
+ linea = LineString([(10, 5), (13, 5), (15, 5)])
+ lineb = LineString([(1, 1), (2, 2), (3, 2), (5, 3), (12, 1)])
+ gdf = GeoDataFrame([1, 2], geometry=[linea, lineb], crs="EPSG:3857")
+ return gdf
def test_not_gdf(single_rectangle_gdf):
"""Non-GeoDataFrame inputs raise attribute errors."""
- pass
+ with pytest.raises(TypeError):
+ clip((2, 3), single_rectangle_gdf)
+ with pytest.raises(TypeError):
+ clip(single_rectangle_gdf, "foobar")
+ with pytest.raises(TypeError):
+ clip(single_rectangle_gdf, (1, 2, 3))
+ with pytest.raises(TypeError):
+ clip(single_rectangle_gdf, (1, 2, 3, 4, 5))
def test_non_overlapping_geoms():
"""Test that a bounding box returns empty if the extents don't overlap"""
- pass
-
-
-@pytest.mark.parametrize('mask_fixture_name', mask_variants_single_rectangle)
+ unit_box = Polygon([(0, 0), (0, 1), (1, 1), (1, 0), (0, 0)])
+ unit_gdf = GeoDataFrame([1], geometry=[unit_box], crs="EPSG:3857")
+ non_overlapping_gdf = unit_gdf.copy()
+ non_overlapping_gdf = non_overlapping_gdf.geometry.apply(
+ lambda x: shapely.affinity.translate(x, xoff=20)
+ )
+ out = clip(unit_gdf, non_overlapping_gdf)
+ assert_geodataframe_equal(out, unit_gdf.iloc[:0])
+ out2 = clip(unit_gdf.geometry, non_overlapping_gdf)
+ assert_geoseries_equal(out2, GeoSeries(crs=unit_gdf.crs))
+
+
+@pytest.mark.parametrize("mask_fixture_name", mask_variants_single_rectangle)
class TestClipWithSingleRectangleGdf:
+ @pytest.fixture
+ def mask(self, mask_fixture_name, request):
+ return request.getfixturevalue(mask_fixture_name)
def test_returns_gdf(self, point_gdf, mask):
"""Test that function returns a GeoDataFrame (or GDF-like) object."""
- pass
+ out = clip(point_gdf, mask)
+ assert isinstance(out, GeoDataFrame)
def test_returns_series(self, point_gdf, mask):
"""Test that function returns a GeoSeries if GeoSeries is passed."""
- pass
+ out = clip(point_gdf.geometry, mask)
+ assert isinstance(out, GeoSeries)
def test_clip_points(self, point_gdf, mask):
"""Test clipping a points GDF with a generic polygon geometry."""
- pass
+ clip_pts = clip(point_gdf, mask)
+ pts = np.array([[2, 2], [3, 4], [9, 8]])
+ exp = GeoDataFrame(
+ [Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857"
+ )
+ assert_geodataframe_equal(clip_pts, exp)
def test_clip_points_geom_col_rename(self, point_gdf, mask):
"""Test clipping a points GDF with a generic polygon geometry."""
- pass
+ point_gdf_geom_col_rename = point_gdf.rename_geometry("geometry2")
+ clip_pts = clip(point_gdf_geom_col_rename, mask)
+ pts = np.array([[2, 2], [3, 4], [9, 8]])
+ exp = GeoDataFrame(
+ [Point(xy) for xy in pts],
+ columns=["geometry2"],
+ crs="EPSG:3857",
+ geometry="geometry2",
+ )
+ assert_geodataframe_equal(clip_pts, exp)
def test_clip_poly(self, buffered_locations, mask):
"""Test clipping a polygon GDF with a generic polygon geometry."""
- pass
+ clipped_poly = clip(buffered_locations, mask)
+ assert len(clipped_poly.geometry) == 3
+ assert all(clipped_poly.geom_type == "Polygon")
def test_clip_poly_geom_col_rename(self, buffered_locations, mask):
"""Test clipping a polygon GDF with a generic polygon geometry."""
- pass
+
+ poly_gdf_geom_col_rename = buffered_locations.rename_geometry("geometry2")
+ clipped_poly = clip(poly_gdf_geom_col_rename, mask)
+ assert len(clipped_poly.geometry) == 3
+ assert "geometry" not in clipped_poly.keys()
+ assert "geometry2" in clipped_poly.keys()
def test_clip_poly_series(self, buffered_locations, mask):
"""Test clipping a polygon GDF with a generic polygon geometry."""
- pass
+ clipped_poly = clip(buffered_locations.geometry, mask)
+ assert len(clipped_poly) == 3
+ assert all(clipped_poly.geom_type == "Polygon")
def test_clip_multipoly_keep_geom_type(self, multi_poly_gdf, mask):
"""Test a multi poly object where the return includes a sliver.
Also the bounds of the object should == the bounds of the clip object
if they fully overlap (as they do in these fixtures)."""
- pass
+ clipped = clip(multi_poly_gdf, mask, keep_geom_type=True)
+ expected_bounds = (
+ mask if _mask_is_list_like_rectangle(mask) else mask.total_bounds
+ )
+ assert np.array_equal(clipped.total_bounds, expected_bounds)
+        # Assert returned data is not a geometry collection
+ assert (clipped.geom_type.isin(["Polygon", "MultiPolygon"])).all()
def test_clip_multiline(self, multi_line, mask):
"""Test that clipping a multiline feature with a poly returns expected
output."""
- pass
+ clipped = clip(multi_line, mask)
+ assert clipped.geom_type[0] == "MultiLineString"
def test_clip_multipoint(self, multi_point, mask):
"""Clipping a multipoint feature with a polygon works as expected.
should return a geodataframe with a single multi point feature"""
- pass
+ clipped = clip(multi_point, mask)
+ assert clipped.geom_type[0] == "MultiPoint"
+ assert hasattr(clipped, "attr")
+ # All points should intersect the clip geom
+ assert len(clipped) == 2
+        clipped_multipoint = MultiPoint(
+ [
+ Point(2, 2),
+ Point(3, 4),
+ Point(9, 8),
+ ]
+ )
+        assert clipped.iloc[0].geometry.wkt == clipped_multipoint.wkt
+ shape_for_points = (
+ box(*mask) if _mask_is_list_like_rectangle(mask) else mask.union_all()
+ )
+ assert all(clipped.intersects(shape_for_points))
def test_clip_lines(self, two_line_gdf, mask):
"""Test what happens when you give the clip_extent a line GDF."""
- pass
+ clip_line = clip(two_line_gdf, mask)
+ assert len(clip_line.geometry) == 2
def test_mixed_geom(self, mixed_gdf, mask):
"""Test clipping a mixed GeoDataFrame"""
- pass
+ clipped = clip(mixed_gdf, mask)
+ assert (
+ clipped.geom_type[0] == "Point"
+ and clipped.geom_type[1] == "Polygon"
+ and clipped.geom_type[2] == "LineString"
+ )
def test_mixed_series(self, mixed_gdf, mask):
"""Test clipping a mixed GeoSeries"""
- pass
+ clipped = clip(mixed_gdf.geometry, mask)
+ assert (
+ clipped.geom_type[0] == "Point"
+ and clipped.geom_type[1] == "Polygon"
+ and clipped.geom_type[2] == "LineString"
+ )
def test_clip_with_line_extra_geom(self, sliver_line, mask):
"""When the output of a clipped line returns a geom collection,
and keep_geom_type is True, no geometry collections should be returned."""
- pass
+ clipped = clip(sliver_line, mask, keep_geom_type=True)
+ assert len(clipped.geometry) == 1
+        # Assert returned data is not a geometry collection
+ assert not (clipped.geom_type == "GeometryCollection").any()
def test_clip_no_box_overlap(self, pointsoutside_nooverlap_gdf, mask):
"""Test clip when intersection is empty and boxes do not overlap."""
- pass
+ clipped = clip(pointsoutside_nooverlap_gdf, mask)
+ assert len(clipped) == 0
def test_clip_box_overlap(self, pointsoutside_overlap_gdf, mask):
"""Test clip when intersection is empty and boxes do overlap."""
- pass
+ clipped = clip(pointsoutside_overlap_gdf, mask)
+ assert len(clipped) == 0
def test_warning_extra_geoms_mixed(self, mixed_gdf, mask):
"""Test the correct warnings are raised if keep_geom_type is
called on a mixed GDF"""
- pass
+ with pytest.warns(UserWarning):
+ clip(mixed_gdf, mask, keep_geom_type=True)
def test_warning_geomcoll(self, geomcol_gdf, mask):
"""Test the correct warnings are raised if keep_geom_type is
called on a GDF with GeometryCollection"""
- pass
+ with pytest.warns(UserWarning):
+ clip(geomcol_gdf, mask, keep_geom_type=True)
def test_clip_line_keep_slivers(sliver_line, single_rectangle_gdf):
"""Test the correct output if a point is returned
from a line only geometry type."""
- pass
+ clipped = clip(sliver_line, single_rectangle_gdf)
+    # sliver geoms are kept, so the result contains a Point and a LineString
+ assert "Point" == clipped.geom_type[0]
+ assert "LineString" == clipped.geom_type[1]
def test_clip_multipoly_keep_slivers(multi_poly_gdf, single_rectangle_gdf):
"""Test a multi poly object where the return includes a sliver.
Also the bounds of the object should == the bounds of the clip object
if they fully overlap (as they do in these fixtures)."""
- pass
+ clipped = clip(multi_poly_gdf, single_rectangle_gdf)
+ assert np.array_equal(clipped.total_bounds, single_rectangle_gdf.total_bounds)
+ # Assert returned data is a geometry collection given sliver geoms
+ assert "GeometryCollection" in clipped.geom_type[0]
+
+
+@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not available")
+def test_warning_crs_mismatch(point_gdf, single_rectangle_gdf):
+ with pytest.warns(UserWarning, match="CRS mismatch between the CRS"):
+ clip(point_gdf, single_rectangle_gdf.to_crs(4326))
def test_clip_with_polygon(single_rectangle_gdf):
"""Test clip when using a shapely object"""
- pass
+ polygon = Polygon([(0, 0), (5, 12), (10, 0), (0, 0)])
+ clipped = clip(single_rectangle_gdf, polygon)
+ exp_poly = polygon.intersection(
+ Polygon([(0, 0), (0, 10), (10, 10), (10, 0), (0, 0)])
+ )
+ exp = GeoDataFrame([1], geometry=[exp_poly], crs="EPSG:3857")
+ exp["attr2"] = "site-boundary"
+ assert_geodataframe_equal(clipped, exp)
def test_clip_with_multipolygon(buffered_locations, single_rectangle_gdf):
"""Test clipping a polygon with a multipolygon."""
- pass
-
-
-@pytest.mark.parametrize('mask_fixture_name', mask_variants_large_rectangle)
-def test_clip_single_multipoly_no_extra_geoms(buffered_locations,
- mask_fixture_name, request):
+ multi = buffered_locations.dissolve(by="type").reset_index()
+ clipped = clip(single_rectangle_gdf, multi)
+ assert clipped.geom_type[0] == "Polygon"
+
+
+@pytest.mark.parametrize(
+ "mask_fixture_name",
+ mask_variants_large_rectangle,
+)
+def test_clip_single_multipoly_no_extra_geoms(
+ buffered_locations, mask_fixture_name, request
+):
"""When clipping a multi-polygon feature, no additional geom types
should be returned."""
- pass
-
-
-@pytest.mark.filterwarnings('ignore:All-NaN slice encountered')
-@pytest.mark.parametrize('mask', [Polygon(), (np.nan,) * 4, (np.nan, 0, np.
- nan, 1), GeoSeries([Polygon(), Polygon()], crs='EPSG:3857'), GeoSeries(
- [Polygon(), Polygon()], crs='EPSG:3857').to_frame(), GeoSeries([], crs=
- 'EPSG:3857'), GeoSeries([], crs='EPSG:3857').to_frame()])
+ masks = request.getfixturevalue(mask_fixture_name)
+ multi = buffered_locations.dissolve(by="type").reset_index()
+ clipped = clip(multi, masks)
+ assert clipped.geom_type[0] == "Polygon"
+
+
+@pytest.mark.filterwarnings("ignore:All-NaN slice encountered")
+@pytest.mark.parametrize(
+ "mask",
+ [
+ Polygon(),
+ (np.nan,) * 4,
+ (np.nan, 0, np.nan, 1),
+ GeoSeries([Polygon(), Polygon()], crs="EPSG:3857"),
+ GeoSeries([Polygon(), Polygon()], crs="EPSG:3857").to_frame(),
+ GeoSeries([], crs="EPSG:3857"),
+ GeoSeries([], crs="EPSG:3857").to_frame(),
+ ],
+)
def test_clip_empty_mask(buffered_locations, mask):
"""Test that clipping with empty mask returns an empty result."""
- pass
+ clipped = clip(buffered_locations, mask)
+ assert_geodataframe_equal(
+ clipped,
+ GeoDataFrame([], columns=["geometry", "type"], crs="EPSG:3857"),
+ check_index_type=False,
+ )
+ clipped = clip(buffered_locations.geometry, mask)
+ assert_geoseries_equal(clipped, GeoSeries([], crs="EPSG:3857"))
def test_clip_sorting(point_gdf2):
"""Test the sorting kwarg in clip"""
- pass
+ bbox = shapely.geometry.box(0, 0, 2, 2)
+ unsorted_clipped_gdf = point_gdf2.clip(bbox)
+ sorted_clipped_gdf = point_gdf2.clip(bbox, sort=True)
+
+ expected_sorted_index = pd.Index([1, 3, 5])
+
+ assert not (sorted(unsorted_clipped_gdf.index) == unsorted_clipped_gdf.index).all()
+ assert (sorted(sorted_clipped_gdf.index) == sorted_clipped_gdf.index).all()
+ assert_index_equal(expected_sorted_index, sorted_clipped_gdf.index)
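As a quick reference for the API these tests exercise, a minimal clip sketch (toy points; a mask may be a geometry, a GeoSeries/GeoDataFrame, or a bounds tuple):

```python
import geopandas
from shapely.geometry import Point, box

pts = geopandas.GeoDataFrame(
    geometry=[Point(1, 1), Point(3, 3), Point(9, 9)], crs="EPSG:3857"
)

inside_bounds = geopandas.clip(pts, (0, 0, 4, 4))   # rectangle given as bounds
inside_poly = geopandas.clip(pts, box(0, 0, 4, 4))  # same rectangle as a geometry

assert len(inside_bounds) == len(inside_poly) == 2  # the point at (9, 9) is dropped
```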
diff --git a/geopandas/tools/tests/test_hilbert_curve.py b/geopandas/tools/tests/test_hilbert_curve.py
index 38de871e..3d79a84c 100644
--- a/geopandas/tools/tests/test_hilbert_curve.py
+++ b/geopandas/tools/tests/test_hilbert_curve.py
@@ -1,6 +1,76 @@
import numpy as np
+
from shapely.geometry import Point
from shapely.wkt import loads
+
import geopandas
+
import pytest
from pandas.testing import assert_series_equal
+
+
+def test_hilbert_distance():
+ # test the actual Hilbert Code algorithm against some hardcoded values
+ geoms = geopandas.GeoSeries.from_wkt(
+ [
+ "POINT (0 0)",
+ "POINT (1 1)",
+ "POINT (1 0)",
+ "POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))",
+ ]
+ )
+ result = geoms.hilbert_distance(total_bounds=(0, 0, 1, 1), level=2)
+ assert result.tolist() == [0, 10, 15, 2]
+
+ result = geoms.hilbert_distance(total_bounds=(0, 0, 1, 1), level=3)
+ assert result.tolist() == [0, 42, 63, 10]
+
+ result = geoms.hilbert_distance(total_bounds=(0, 0, 1, 1), level=16)
+ assert result.tolist() == [0, 2863311530, 4294967295, 715827882]
+
+
+@pytest.fixture
+def geoseries_points():
+ p1 = Point(1, 2)
+ p2 = Point(2, 3)
+ p3 = Point(3, 4)
+ p4 = Point(4, 1)
+ return geopandas.GeoSeries([p1, p2, p3, p4])
+
+
+def test_hilbert_distance_level(geoseries_points):
+ with pytest.raises(ValueError):
+ geoseries_points.hilbert_distance(level=20)
+
+
+def test_specified_total_bounds(geoseries_points):
+ result = geoseries_points.hilbert_distance(
+ total_bounds=geoseries_points.total_bounds
+ )
+ expected = geoseries_points.hilbert_distance()
+ assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "empty",
+ [
+ None,
+ loads("POLYGON EMPTY"),
+ ],
+)
+def test_empty(geoseries_points, empty):
+ s = geoseries_points
+ s.iloc[-1] = empty
+ with pytest.raises(
+ ValueError, match="cannot be computed on a GeoSeries with empty"
+ ):
+ s.hilbert_distance()
+
+
+def test_zero_width():
+ # special case of all points on the same line -> avoid warnings because
+ # of division by 0 and introducing NaN
+ s = geopandas.GeoSeries([Point(0, 0), Point(0, 2), Point(0, 1)])
+ with np.errstate(all="raise"):
+ result = s.hilbert_distance()
+ assert np.array(result).argsort().tolist() == [0, 2, 1]
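For context, `hilbert_distance` maps each geometry (via the midpoint of its bounds) to a position on a discretised Hilbert curve, so sorting by it clusters nearby features; a small sketch with made-up points:

```python
import geopandas
from shapely.geometry import Point

s = geopandas.GeoSeries([Point(9, 9), Point(0, 0), Point(1, 1), Point(8, 8)])

# order the series along the Hilbert curve: spatial neighbours end up adjacent
order = s.hilbert_distance().argsort().to_numpy()
print(s.iloc[order])
```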
diff --git a/geopandas/tools/tests/test_random.py b/geopandas/tools/tests/test_random.py
index ada7295a..a8d9a4fb 100644
--- a/geopandas/tools/tests/test_random.py
+++ b/geopandas/tools/tests/test_random.py
@@ -1,4 +1,67 @@
import numpy
+
import geopandas
from geopandas.tools._random import uniform
+
import pytest
+
+
+@pytest.fixture
+def multipolygons(nybb_filename):
+ return geopandas.read_file(nybb_filename).geometry
+
+
+@pytest.fixture
+def polygons(multipolygons):
+ return multipolygons.explode(ignore_index=True).geometry
+
+
+@pytest.fixture
+def multilinestrings(multipolygons):
+ return multipolygons.boundary
+
+
+@pytest.fixture
+def linestrings(polygons):
+ return polygons.boundary
+
+
+@pytest.fixture
+def points(multipolygons):
+ return multipolygons.centroid
+
+
+@pytest.mark.parametrize("size", [10, 100])
+@pytest.mark.parametrize(
+ "geom_fixture", ["multipolygons", "polygons", "multilinestrings", "linestrings"]
+)
+def test_uniform(geom_fixture, size, request):
+ geom = request.getfixturevalue(geom_fixture)[0]
+ sample = uniform(geom, size=size, rng=1)
+ sample_series = (
+ geopandas.GeoSeries(sample).explode(index_parts=True).reset_index(drop=True)
+ )
+ assert len(sample_series) == size
+ sample_in_geom = sample_series.buffer(0.00000001).sindex.query(
+ geom, predicate="intersects"
+ )
+ assert len(sample_in_geom) == size
+
+
+def test_uniform_unsupported(points):
+ with pytest.warns(UserWarning, match="Sampling is not supported"):
+ sample = uniform(points[0], size=10, rng=1)
+ assert sample.is_empty
+
+
+def test_uniform_generator(polygons):
+ sample = uniform(polygons[0], size=10, rng=1)
+ sample2 = uniform(polygons[0], size=10, rng=1)
+ assert sample.equals(sample2)
+
+ generator = numpy.random.default_rng(seed=1)
+ gen_sample = uniform(polygons[0], size=10, rng=generator)
+ gen_sample2 = uniform(polygons[0], size=10, rng=generator)
+
+ assert sample.equals(gen_sample)
+ assert not sample.equals(gen_sample2)
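The seeding behaviour checked above can be reproduced in isolation with the same private helper these tests import (a toy polygon; not public API):

```python
from shapely.geometry import box
from geopandas.tools._random import uniform

poly = box(0, 0, 10, 10)

# an integer seed makes the draw reproducible: the same seed gives the same points
a = uniform(poly, size=5, rng=42)
b = uniform(poly, size=5, rng=42)
assert a.equals(b)
```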
diff --git a/geopandas/tools/tests/test_sjoin.py b/geopandas/tools/tests/test_sjoin.py
index a4a880cd..0e44f87b 100644
--- a/geopandas/tools/tests/test_sjoin.py
+++ b/geopandas/tools/tests/test_sjoin.py
@@ -1,33 +1,1352 @@
import math
from typing import Sequence
+
import numpy as np
import pandas as pd
+
import shapely
from shapely.geometry import GeometryCollection, Point, Polygon, box
+
import geopandas
import geopandas._compat as compat
-from geopandas import GeoDataFrame, GeoSeries, points_from_xy, read_file, sjoin, sjoin_nearest
+from geopandas import (
+ GeoDataFrame,
+ GeoSeries,
+ points_from_xy,
+ read_file,
+ sjoin,
+ sjoin_nearest,
+)
+
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal
+@pytest.fixture()
+def dfs(request):
+ polys1 = GeoSeries(
+ [
+ Polygon([(0, 0), (5, 0), (5, 5), (0, 5)]),
+ Polygon([(5, 5), (6, 5), (6, 6), (5, 6)]),
+ Polygon([(6, 0), (9, 0), (9, 3), (6, 3)]),
+ ]
+ )
+
+ polys2 = GeoSeries(
+ [
+ Polygon([(1, 1), (4, 1), (4, 4), (1, 4)]),
+ Polygon([(4, 4), (7, 4), (7, 7), (4, 7)]),
+ Polygon([(7, 7), (10, 7), (10, 10), (7, 10)]),
+ ]
+ )
+
+ df1 = GeoDataFrame({"geometry": polys1, "df1": [0, 1, 2]})
+ df2 = GeoDataFrame({"geometry": polys2, "df2": [3, 4, 5]})
+
+ if request.param == "string-index":
+ df1.index = ["a", "b", "c"]
+ df2.index = ["d", "e", "f"]
+
+ if request.param == "named-index":
+ df1.index.name = "df1_ix"
+ df2.index.name = "df2_ix"
+
+ if request.param == "multi-index":
+ i1 = ["a", "b", "c"]
+ i2 = ["d", "e", "f"]
+ df1 = df1.set_index([i1, i2])
+ df2 = df2.set_index([i2, i1])
+
+ if request.param == "named-multi-index":
+ i1 = ["a", "b", "c"]
+ i2 = ["d", "e", "f"]
+ df1 = df1.set_index([i1, i2])
+ df2 = df2.set_index([i2, i1])
+ df1.index.names = ["df1_ix1", "df1_ix2"]
+ df2.index.names = ["df2_ix1", "df2_ix2"]
+
+ # construction expected frames
+ expected = {}
+
+ part1 = df1.copy().reset_index().rename(columns={"index": "index_left"})
+ part2 = (
+ df2.copy()
+ .iloc[[0, 1, 1, 2]]
+ .reset_index()
+ .rename(columns={"index": "index_right"})
+ )
+ part1["_merge"] = [0, 1, 2]
+ part2["_merge"] = [0, 0, 1, 3]
+ exp = pd.merge(part1, part2, on="_merge", how="outer")
+ expected["intersects"] = exp.drop("_merge", axis=1).copy()
+
+ part1 = df1.copy().reset_index().rename(columns={"index": "index_left"})
+ part2 = df2.copy().reset_index().rename(columns={"index": "index_right"})
+ part1["_merge"] = [0, 1, 2]
+ part2["_merge"] = [0, 3, 3]
+ exp = pd.merge(part1, part2, on="_merge", how="outer")
+ expected["contains"] = exp.drop("_merge", axis=1).copy()
+
+ part1["_merge"] = [0, 1, 2]
+ part2["_merge"] = [3, 1, 3]
+ exp = pd.merge(part1, part2, on="_merge", how="outer")
+ expected["within"] = exp.drop("_merge", axis=1).copy()
+
+ return [request.param, df1, df2, expected]
+
+
+@pytest.fixture()
+def dfs_shared_attribute():
+ geo_left = [
+ Point(0, 0),
+ Point(1, 1),
+ Point(2, 2),
+ Point(3, 3),
+ Point(4, 4),
+ Point(5, 5),
+ Point(6, 6),
+ Point(7, 7),
+ ]
+ geo_right = [
+ Point(0, 0),
+ Point(1, 1),
+ Point(2, 2),
+ Point(3, 3),
+ Point(4, 4),
+ Point(5, 5),
+ Point(6, 6),
+ Point(7, 7),
+ ]
+ attr_tracker = ["A", "B", "C", "D", "E", "F", "G", "H"]
+
+ left_gdf = geopandas.GeoDataFrame(
+ {
+ "geometry": geo_left,
+ "attr_tracker": attr_tracker,
+ "duplicate_column": [0, 1, 2, 3, 4, 5, 6, 7],
+ "attr1": [True, True, True, True, True, True, True, True],
+ "attr2": [True, True, True, True, True, True, True, True],
+ }
+ )
+
+ right_gdf = geopandas.GeoDataFrame(
+ {
+ "geometry": geo_right,
+ "duplicate_column": [0, 1, 2, 3, 4, 5, 6, 7],
+ "attr1": [True, True, False, False, True, True, False, False],
+ "attr2": [True, True, False, False, False, False, False, False],
+ }
+ )
+
+ return left_gdf, right_gdf
+
+
class TestSpatialJoin:
- pass
+ @pytest.mark.parametrize(
+ "how, lsuffix, rsuffix, expected_cols",
+ [
+ ("left", "left", "right", {"col_left", "col_right", "index_right"}),
+ ("inner", "left", "right", {"col_left", "col_right", "index_right"}),
+ ("right", "left", "right", {"col_left", "col_right", "index_left"}),
+ ("left", "lft", "rgt", {"col_lft", "col_rgt", "index_rgt"}),
+ ("inner", "lft", "rgt", {"col_lft", "col_rgt", "index_rgt"}),
+ ("right", "lft", "rgt", {"col_lft", "col_rgt", "index_lft"}),
+ ],
+ )
+ def test_suffixes(self, how: str, lsuffix: str, rsuffix: str, expected_cols):
+ left = GeoDataFrame({"col": [1], "geometry": [Point(0, 0)]})
+ right = GeoDataFrame({"col": [1], "geometry": [Point(0, 0)]})
+ joined = sjoin(left, right, how=how, lsuffix=lsuffix, rsuffix=rsuffix)
+ assert set(joined.columns) == expected_cols | {"geometry"}
+
+ @pytest.mark.skipif(not compat.HAS_PYPROJ, reason="pyproj not available")
+ @pytest.mark.parametrize("dfs", ["default-index", "string-index"], indirect=True)
+ def test_crs_mismatch(self, dfs):
+ index, df1, df2, expected = dfs
+ df1.crs = "epsg:4326"
+ with pytest.warns(UserWarning, match="CRS mismatch between the CRS"):
+ sjoin(df1, df2)
+
+ @pytest.mark.parametrize("dfs", ["default-index"], indirect=True)
+ def test_unknown_kwargs(self, dfs):
+ _, df1, df2, _ = dfs
+ with pytest.raises(
+ TypeError,
+ match=r"sjoin\(\) got an unexpected keyword argument 'extra_param'",
+ ):
+ sjoin(df1, df2, extra_param="test")
+
+ @pytest.mark.parametrize(
+ "dfs",
+ [
+ "default-index",
+ "string-index",
+ "named-index",
+ "multi-index",
+ "named-multi-index",
+ ],
+ indirect=True,
+ )
+ @pytest.mark.parametrize("predicate", ["intersects", "contains", "within"])
+ def test_inner(self, predicate, dfs):
+ index, df1, df2, expected = dfs
+
+ res = sjoin(df1, df2, how="inner", predicate=predicate)
+ exp = expected[predicate].dropna().copy()
+ exp = exp.drop("geometry_y", axis=1).rename(columns={"geometry_x": "geometry"})
+ exp[["df1", "df2"]] = exp[["df1", "df2"]].astype("int64")
+ if index == "default-index":
+ exp[["index_left", "index_right"]] = exp[
+ ["index_left", "index_right"]
+ ].astype("int64")
+ if index == "named-index":
+ exp[["df1_ix", "df2_ix"]] = exp[["df1_ix", "df2_ix"]].astype("int64")
+ exp = exp.set_index("df1_ix")
+ if index in ["default-index", "string-index"]:
+ exp = exp.set_index("index_left")
+ exp.index.name = None
+ if index == "multi-index":
+ exp = exp.set_index(["level_0_x", "level_1_x"]).rename(
+ columns={"level_0_y": "index_right0", "level_1_y": "index_right1"}
+ )
+ exp.index.names = df1.index.names
+ if index == "named-multi-index":
+ exp = exp.set_index(["df1_ix1", "df1_ix2"])
+ assert_frame_equal(res, exp)
+
+ @pytest.mark.parametrize(
+ "dfs",
+ [
+ "default-index",
+ "string-index",
+ "named-index",
+ "multi-index",
+ "named-multi-index",
+ ],
+ indirect=True,
+ )
+ @pytest.mark.parametrize("predicate", ["intersects", "contains", "within"])
+ def test_left(self, predicate, dfs):
+ index, df1, df2, expected = dfs
+
+ res = sjoin(df1, df2, how="left", predicate=predicate)
+
+ if index in ["default-index", "string-index"]:
+ exp = expected[predicate].dropna(subset=["index_left"]).copy()
+ elif index == "named-index":
+ exp = expected[predicate].dropna(subset=["df1_ix"]).copy()
+ elif index == "multi-index":
+ exp = expected[predicate].dropna(subset=["level_0_x"]).copy()
+ elif index == "named-multi-index":
+ exp = expected[predicate].dropna(subset=["df1_ix1"]).copy()
+ exp = exp.drop("geometry_y", axis=1).rename(columns={"geometry_x": "geometry"})
+ exp["df1"] = exp["df1"].astype("int64")
+ if index == "default-index":
+ exp["index_left"] = exp["index_left"].astype("int64")
+ # TODO: in result the dtype is object
+ res["index_right"] = res["index_right"].astype(float)
+ elif index == "named-index":
+ exp[["df1_ix"]] = exp[["df1_ix"]].astype("int64")
+ exp = exp.set_index("df1_ix")
+ if index in ["default-index", "string-index"]:
+ exp = exp.set_index("index_left")
+ exp.index.name = None
+ if index == "multi-index":
+ exp = exp.set_index(["level_0_x", "level_1_x"]).rename(
+ columns={"level_0_y": "index_right0", "level_1_y": "index_right1"}
+ )
+ exp.index.names = df1.index.names
+ if index == "named-multi-index":
+ exp = exp.set_index(["df1_ix1", "df1_ix2"])
+
+ assert_frame_equal(res, exp)
+
+ def test_empty_join(self):
+ # Check joins resulting in empty gdfs.
+ polygons = geopandas.GeoDataFrame(
+ {
+ "col2": [1, 2],
+ "geometry": [
+ Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
+ Polygon([(1, 0), (2, 0), (2, 1), (1, 1)]),
+ ],
+ }
+ )
+ not_in = geopandas.GeoDataFrame({"col1": [1], "geometry": [Point(-0.5, 0.5)]})
+ empty = sjoin(not_in, polygons, how="left", predicate="intersects")
+ assert empty.index_right.isnull().all()
+ empty = sjoin(not_in, polygons, how="right", predicate="intersects")
+ assert empty.index_left.isnull().all()
+ empty = sjoin(not_in, polygons, how="inner", predicate="intersects")
+ assert empty.empty
+
+ @pytest.mark.parametrize(
+ "predicate",
+ [
+ "contains",
+ "contains_properly",
+ "covered_by",
+ "covers",
+ "crosses",
+ "intersects",
+ "touches",
+ "within",
+ ],
+ )
+ @pytest.mark.parametrize(
+ "empty",
+ [
+ GeoDataFrame(geometry=[GeometryCollection(), GeometryCollection()]),
+ GeoDataFrame(geometry=GeoSeries()),
+ ],
+ )
+ def test_join_with_empty(self, predicate, empty):
+ # Check joins with empty geometry columns/dataframes.
+ polygons = geopandas.GeoDataFrame(
+ {
+ "col2": [1, 2],
+ "geometry": [
+ Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
+ Polygon([(1, 0), (2, 0), (2, 1), (1, 1)]),
+ ],
+ }
+ )
+ result = sjoin(empty, polygons, how="left", predicate=predicate)
+ assert result.index_right.isnull().all()
+ result = sjoin(empty, polygons, how="right", predicate=predicate)
+ assert result.index_left.isnull().all()
+ result = sjoin(empty, polygons, how="inner", predicate=predicate)
+ assert result.empty
+
+ @pytest.mark.parametrize("dfs", ["default-index", "string-index"], indirect=True)
+ def test_sjoin_invalid_args(self, dfs):
+ index, df1, df2, expected = dfs
+
+ with pytest.raises(ValueError, match="'left_df' should be GeoDataFrame"):
+ sjoin(df1.geometry, df2)
+
+ with pytest.raises(ValueError, match="'right_df' should be GeoDataFrame"):
+ sjoin(df1, df2.geometry)
+
+ @pytest.mark.parametrize(
+ "dfs",
+ [
+ "default-index",
+ "string-index",
+ "named-index",
+ "multi-index",
+ "named-multi-index",
+ ],
+ indirect=True,
+ )
+ @pytest.mark.parametrize("predicate", ["intersects", "contains", "within"])
+ def test_right(self, predicate, dfs):
+ index, df1, df2, expected = dfs
+
+ res = sjoin(df1, df2, how="right", predicate=predicate)
+
+ if index in ["default-index", "string-index"]:
+ exp = expected[predicate].dropna(subset=["index_right"]).copy()
+ elif index == "named-index":
+ exp = expected[predicate].dropna(subset=["df2_ix"]).copy()
+ elif index == "multi-index":
+ exp = expected[predicate].dropna(subset=["level_0_y"]).copy()
+ elif index == "named-multi-index":
+ exp = expected[predicate].dropna(subset=["df2_ix1"]).copy()
+ exp = exp.drop("geometry_x", axis=1).rename(columns={"geometry_y": "geometry"})
+ exp["df2"] = exp["df2"].astype("int64")
+ if index == "default-index":
+ exp["index_right"] = exp["index_right"].astype("int64")
+ res["index_left"] = res["index_left"].astype(float)
+ elif index == "named-index":
+ exp[["df2_ix"]] = exp[["df2_ix"]].astype("int64")
+ exp = exp.set_index("df2_ix")
+ if index in ["default-index", "string-index"]:
+ exp = exp.set_index("index_right")
+ exp = exp.reindex(columns=res.columns)
+ exp.index.name = None
+ if index == "multi-index":
+ exp = exp.set_index(["level_0_y", "level_1_y"]).rename(
+ columns={"level_0_x": "index_left0", "level_1_x": "index_left1"}
+ )
+ exp.index.names = df2.index.names
+ if index == "named-multi-index":
+ exp = exp.set_index(["df2_ix1", "df2_ix2"])
+
+ if predicate == "within":
+ exp = exp.sort_index()
+
+ assert_frame_equal(res, exp, check_index_type=False)
+
+ @pytest.mark.skipif(not compat.GEOS_GE_310, reason="`dwithin` requires GEOS 3.10")
+ @pytest.mark.parametrize("how", ["inner"])
+ @pytest.mark.parametrize(
+ "geo_left, geo_right, expected_left, expected_right, distance",
+ [
+ (
+ # Distance is number, 2x1
+ [Point(0, 0), Point(1, 1)],
+ [Point(1, 1)],
+ [0, 1],
+ [0, 0],
+ math.sqrt(2),
+ ),
+ # Distance is number, 2x2
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(0, 0), Point(1, 1)],
+ [0, 1, 0, 1],
+ [0, 0, 1, 1],
+ math.sqrt(2),
+ ),
+ # Distance is array, matches len(left)
+ (
+ [Point(0, 0), Point(0, 0), Point(-1, -1)],
+ [Point(1, 1)],
+ [1, 2],
+ [0, 0],
+ [0, math.sqrt(2), math.sqrt(8)],
+ ),
+ # Distance is np.array, matches len(left),
+ # inner join sorts the right GeoDataFrame
+ (
+ [Point(0, 0), Point(0, 0), Point(-1, -1)],
+ [Point(1, 1), Point(0.5, 0.5)],
+ [1, 2, 1, 2],
+ [1, 1, 0, 0],
+ np.array([0, math.sqrt(2), math.sqrt(8)]),
+ ),
+ ],
+ )
+ def test_sjoin_dwithin(
+ self,
+ geo_left,
+ geo_right,
+ expected_left: Sequence[int],
+ expected_right: Sequence[int],
+ distance,
+ how,
+ ):
+ left = geopandas.GeoDataFrame({"geometry": geo_left})
+ right = geopandas.GeoDataFrame({"geometry": geo_right})
+ expected_gdf = left.iloc[expected_left].copy()
+ expected_gdf["index_right"] = expected_right
+ joined = sjoin(left, right, how=how, predicate="dwithin", distance=distance)
+ assert_frame_equal(expected_gdf.sort_index(), joined.sort_index())
+
+ # GH3239
+ @pytest.mark.parametrize(
+ "predicate",
+ [
+ "contains",
+ "contains_properly",
+ "covered_by",
+ "covers",
+ "crosses",
+ "intersects",
+ "touches",
+ "within",
+ ],
+ )
+ def test_sjoin_left_order(self, predicate):
+ # a set of points in random order -> that order should be preserved
+ # with a left join
+ pts = GeoDataFrame(
+ geometry=points_from_xy([0.1, 0.4, 0.3, 0.7], [0.8, 0.6, 0.9, 0.1])
+ )
+ polys = GeoDataFrame(
+ {"id": [1, 2, 3, 4]},
+ geometry=[
+ box(0, 0, 0.5, 0.5),
+ box(0, 0.5, 0.5, 1),
+ box(0.5, 0, 1, 0.5),
+ box(0.5, 0.5, 1, 1),
+ ],
+ )
+
+ joined = sjoin(pts, polys, predicate=predicate, how="left")
+ assert_index_equal(joined.index, pts.index)
+
+ def test_sjoin_shared_attribute(self, naturalearth_lowres, naturalearth_cities):
+ countries = read_file(naturalearth_lowres)
+ cities = read_file(naturalearth_cities)
+ countries = countries[["geometry", "name"]].rename(columns={"name": "country"})
+
+ # Add first letter of country/city as an attribute column to be compared
+ countries["firstLetter"] = countries["country"].astype(str).str[0]
+ cities["firstLetter"] = cities["name"].astype(str).str[0]
+
+ result = sjoin(cities, countries, on_attribute="firstLetter")
+ assert (
+ result["country"].astype(str).str[0] == result["name"].astype(str).str[0]
+ ).all()
+ assert result.shape == (23, 5)
+
+ @pytest.mark.parametrize(
+ "attr1_key_change_dict, attr2_key_change_dict",
+ [
+ pytest.param(
+ {True: "merge", False: "no_merge"},
+ {True: "merge", False: "no_merge"},
+ id="merge on string attributes",
+ ),
+ pytest.param(
+ {True: 2, False: 1},
+ {True: 2, False: 1},
+ id="merge on integer attributes",
+ ),
+ pytest.param(
+ {True: True, False: False},
+ {True: True, False: False},
+ id="merge on boolean attributes",
+ ),
+ pytest.param(
+ {True: True, False: False},
+ {True: "merge", False: "no_merge"},
+ id="merge on mixed attributes",
+ ),
+ ],
+ )
+ def test_sjoin_multiple_attributes_datatypes(
+ self, dfs_shared_attribute, attr1_key_change_dict, attr2_key_change_dict
+ ):
+ left_gdf, right_gdf = dfs_shared_attribute
+ left_gdf["attr1"] = left_gdf["attr1"].map(attr1_key_change_dict)
+ left_gdf["attr2"] = left_gdf["attr2"].map(attr2_key_change_dict)
+ right_gdf["attr1"] = right_gdf["attr1"].map(attr1_key_change_dict)
+ right_gdf["attr2"] = right_gdf["attr2"].map(attr2_key_change_dict)
+
+ joined = sjoin(left_gdf, right_gdf, on_attribute=("attr1", "attr2"))
+ assert (["A", "B"] == joined["attr_tracker"].values).all()
+
+ def test_sjoin_multiple_attributes_check_header(self, dfs_shared_attribute):
+ left_gdf, right_gdf = dfs_shared_attribute
+ joined = sjoin(left_gdf, right_gdf, on_attribute=["attr1"])
+
+ assert (["A", "B", "E", "F"] == joined["attr_tracker"].values).all()
+ assert {"attr2_left", "attr2_right", "attr1"}.issubset(joined.columns)
+ assert "attr1_left" not in joined
+
+ def test_sjoin_error_column_does_not_exist(self, dfs_shared_attribute):
+ left_gdf, right_gdf = dfs_shared_attribute
+ right_gdf_dropped_attr = right_gdf.drop("attr1", axis=1)
+ left_gdf_dropped_attr = left_gdf.drop("attr1", axis=1)
+
+ with pytest.raises(
+ ValueError,
+ match="Expected column attr1 is missing from the right dataframe.",
+ ):
+ sjoin(left_gdf, right_gdf_dropped_attr, on_attribute="attr1")
+
+ with pytest.raises(
+ ValueError,
+ match="Expected column attr1 is missing from the left dataframe.",
+ ):
+ sjoin(left_gdf_dropped_attr, right_gdf, on_attribute="attr1")
+
+ with pytest.raises(
+ ValueError,
+ match="Expected column attr1 is missing from both of the dataframes.",
+ ):
+ sjoin(left_gdf_dropped_attr, right_gdf_dropped_attr, on_attribute="attr1")
+
+ def test_sjoin_error_use_geometry_column(self, dfs_shared_attribute):
+ left_gdf, right_gdf = dfs_shared_attribute
+ with pytest.raises(
+ ValueError,
+ match="Active geometry column cannot be used as an input for "
+ "on_attribute parameter.",
+ ):
+ sjoin(left_gdf, right_gdf, on_attribute="geometry")
+ with pytest.raises(
+ ValueError,
+ match="Active geometry column cannot be used as an input for "
+ "on_attribute parameter.",
+ ):
+ sjoin(left_gdf, right_gdf, on_attribute=["attr1", "geometry"])
class TestIndexNames:
- pass
+ @pytest.mark.parametrize("how", ["inner", "left", "right"])
+ def test_preserve_index_names(self, how):
+ # preserve names of both left and right index
+ geoms = [Point(1, 1), Point(2, 2)]
+ df1 = GeoDataFrame({"geometry": geoms}, index=pd.Index([1, 2], name="myidx1"))
+ df2 = GeoDataFrame(
+ {"geometry": geoms}, index=pd.Index(["a", "b"], name="myidx2")
+ )
+ result = sjoin(df1, df2, how=how)
+ if how in ("inner", "left"):
+ expected = GeoDataFrame(
+ {"myidx1": [1, 2], "geometry": geoms, "myidx2": ["a", "b"]}
+ ).set_index("myidx1")
+ else:
+ # right join
+ expected = GeoDataFrame(
+ {"myidx2": ["a", "b"], "myidx1": [1, 2], "geometry": geoms},
+ ).set_index("myidx2")
+ assert_geodataframe_equal(result, expected)
+
+ # but also add suffixes if both left and right have the same index
+ df1.index.name = "myidx"
+ df2.index.name = "myidx"
+ result = sjoin(df1, df2, how=how)
+ if how in ("inner", "left"):
+ expected = GeoDataFrame(
+ {"myidx_left": [1, 2], "geometry": geoms, "myidx_right": ["a", "b"]}
+ ).set_index("myidx_left")
+ else:
+ # right join
+ expected = GeoDataFrame(
+ {"myidx_right": ["a", "b"], "myidx_left": [1, 2], "geometry": geoms},
+ ).set_index("myidx_right")
+ assert_geodataframe_equal(result, expected)
+
+ @pytest.mark.parametrize("how", ["inner", "left", "right"])
+ def test_preserve_index_names_multiindex(self, how):
+ # preserve names of both left and right index
+ geoms = [Point(1, 1), Point(2, 2)]
+ df1 = GeoDataFrame(
+ {"geometry": geoms},
+ index=pd.MultiIndex.from_tuples(
+ [("a", 1), ("b", 2)], names=["myidx1", "level2"]
+ ),
+ )
+ df2 = GeoDataFrame(
+ {"geometry": geoms},
+ index=pd.MultiIndex.from_tuples(
+ [("c", 3), ("d", 4)], names=["myidx2", None]
+ ),
+ )
+ result = sjoin(df1, df2, how=how)
+ expected_base = GeoDataFrame(
+ {
+ "myidx1": ["a", "b"],
+ "level2": [1, 2],
+ "geometry": geoms,
+ "myidx2": ["c", "d"],
+ "index_right1": [3, 4],
+ }
+ )
+ if how in ("inner", "left"):
+ expected = expected_base.set_index(["myidx1", "level2"])
+ else:
+ # right join
+ expected = expected_base.set_index(["myidx2", "index_right1"])
+ # if it was originally None, that is preserved
+ expected.index.names = ["myidx2", None]
+ assert_geodataframe_equal(result, expected)
+
+ # but also add suffixes if both left and right have the same index
+ df1.index.names = ["myidx", "level2"]
+ df2.index.names = ["myidx", None]
+ result = sjoin(df1, df2, how=how)
+ expected_base = GeoDataFrame(
+ {
+ "myidx_left": ["a", "b"],
+ "level2": [1, 2],
+ "geometry": geoms,
+ "myidx_right": ["c", "d"],
+ "index_right1": [3, 4],
+ }
+ )
+ if how in ("inner", "left"):
+ expected = expected_base.set_index(["myidx_left", "level2"])
+ else:
+ # right join
+ expected = expected_base.set_index(["myidx_right", "index_right1"])
+ # if it was originally None, that is preserved
+ expected.index.names = ["myidx_right", None]
+ assert_geodataframe_equal(result, expected)
+ @pytest.mark.parametrize("how", ["inner", "left", "right"])
+ def test_duplicate_column_index_name(self, how):
+ # case where a left column and the right index have the same name or the
+ # other way around -> correctly add suffix or preserve index name
+ geoms = [Point(1, 1), Point(2, 2)]
+ df1 = GeoDataFrame({"myidx": [1, 2], "geometry": geoms})
+ df2 = GeoDataFrame(
+ {"geometry": geoms}, index=pd.Index(["a", "b"], name="myidx")
+ )
+ result = sjoin(df1, df2, how=how)
+ if how in ("inner", "left"):
+ expected = GeoDataFrame(
+ {"myidx_left": [1, 2], "geometry": geoms, "myidx_right": ["a", "b"]}
+ )
+ else:
+ # right join
+ expected = GeoDataFrame(
+ {"index_left": [0, 1], "myidx_left": [1, 2], "geometry": geoms},
+ index=pd.Index(["a", "b"], name="myidx_right"),
+ )
+ assert_geodataframe_equal(result, expected)
-@pytest.mark.usefixtures('_setup_class_nybb_filename')
+ result = sjoin(df2, df1, how=how)
+ if how in ("inner", "left"):
+ expected = GeoDataFrame(
+ {"geometry": geoms, "index_right": [0, 1], "myidx_right": [1, 2]},
+ index=pd.Index(["a", "b"], name="myidx_left"),
+ )
+ else:
+ # right join
+ expected = GeoDataFrame(
+ {"myidx_left": ["a", "b"], "myidx_right": [1, 2], "geometry": geoms},
+ )
+ assert_geodataframe_equal(result, expected)
+
+ @pytest.mark.parametrize("how", ["inner", "left", "right"])
+ def test_duplicate_column_index_name_multiindex(self, how):
+ # case where a left column and the right index have the same name or the
+ # other way around -> correctly add suffix or preserve index name
+ geoms = [Point(1, 1), Point(2, 2)]
+ df1 = GeoDataFrame({"myidx": [1, 2], "geometry": geoms})
+ df2 = GeoDataFrame(
+ {"geometry": geoms},
+ index=pd.MultiIndex.from_tuples(
+ [("a", 1), ("b", 2)], names=["myidx", "level2"]
+ ),
+ )
+ result = sjoin(df1, df2, how=how)
+ if how in ("inner", "left"):
+ expected = GeoDataFrame(
+ {
+ "myidx_left": [1, 2],
+ "geometry": geoms,
+ "myidx_right": ["a", "b"],
+ "level2": [1, 2],
+ }
+ )
+ else:
+ # right join
+ expected = GeoDataFrame(
+ {"index_left": [0, 1], "myidx_left": [1, 2], "geometry": geoms},
+ index=pd.MultiIndex.from_tuples(
+ [("a", 1), ("b", 2)], names=["myidx_right", "level2"]
+ ),
+ )
+ assert_geodataframe_equal(result, expected)
+
+ result = sjoin(df2, df1, how=how)
+ if how in ("inner", "left"):
+ expected = GeoDataFrame(
+ {"geometry": geoms, "index_right": [0, 1], "myidx_right": [1, 2]},
+ index=pd.MultiIndex.from_tuples(
+ [("a", 1), ("b", 2)], names=["myidx_left", "level2"]
+ ),
+ )
+ else:
+ # right join
+ expected = GeoDataFrame(
+ {
+ "myidx_left": ["a", "b"],
+ "level2": [1, 2],
+ "myidx_right": [1, 2],
+ "geometry": geoms,
+ },
+ )
+ assert_geodataframe_equal(result, expected)
+
+ @pytest.mark.parametrize("how", ["inner", "left", "right"])
+ def test_conflicting_column_index_name(self, how):
+ # test case where the auto-generated index name conflicts
+ geoms = [Point(1, 1), Point(2, 2)]
+ df1 = GeoDataFrame({"index_right": [1, 2], "geometry": geoms})
+ df2 = GeoDataFrame({"geometry": geoms})
+ with pytest.raises(ValueError, match="'index_right' cannot be a column name"):
+ sjoin(df1, df2, how=how)
+
+ @pytest.mark.parametrize("how", ["inner", "left", "right"])
+ def test_conflicting_column_with_suffix(self, how):
+ # test case where the auto-generated index name conflicts
+ geoms = [Point(1, 1), Point(2, 2)]
+ df1 = GeoDataFrame(
+ {"column": [1, 2], "column_right": ["a", "b"], "geometry": geoms}
+ )
+ df2 = GeoDataFrame({"column": [0.1, 0.2], "geometry": geoms})
+
+ result = sjoin(df1, df2, how=how)
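+        # the expected frames are built with placeholder integer keys and renamed
+        # afterwards, because the result contains two columns named "column_right"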
+ if how in ("inner", "left"):
+ expected = GeoDataFrame(
+ {1: [1, 2], 2: ["a", "b"], 3: geoms, 4: [0, 1], 5: [0.1, 0.2]}
+ )
+ expected.columns = [
+ "column_left",
+ "column_right",
+ "geometry",
+ "index_right",
+ "column_right",
+ ]
+ else:
+ # right join
+ expected = GeoDataFrame(
+ {1: [0, 1], 2: [1, 2], 3: ["a", "b"], 4: [0.1, 0.2], 5: geoms}
+ )
+ expected.columns = [
+ "index_left",
+ "column_left",
+ "column_right",
+ "column_right",
+ "geometry",
+ ]
+ expected = expected.set_geometry("geometry")
+ assert_geodataframe_equal(result, expected)
+
+
+@pytest.mark.usefixtures("_setup_class_nybb_filename")
class TestSpatialJoinNYBB:
+ def setup_method(self):
+ self.polydf = read_file(self.nybb_filename)
+ self.crs = self.polydf.crs
+ N = 20
+ b = [int(x) for x in self.polydf.total_bounds]
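+        # generate ~N points stepping from the lower-left towards the upper-right
+        # corner of the NYBB total bounds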
+ self.pointdf = GeoDataFrame(
+ [
+ {"geometry": Point(x, y), "pointattr1": x + y, "pointattr2": x - y}
+ for x, y in zip(
+ range(b[0], b[2], int((b[2] - b[0]) / N)),
+ range(b[1], b[3], int((b[3] - b[1]) / N)),
+ )
+ ],
+ crs=self.crs,
+ )
+
+ def test_geometry_name(self):
+ # test sjoin is working with other geometry name
+ polydf_original_geom_name = self.polydf.geometry.name
+ self.polydf = self.polydf.rename(columns={"geometry": "new_geom"}).set_geometry(
+ "new_geom"
+ )
+ assert polydf_original_geom_name != self.polydf.geometry.name
+ res = sjoin(self.polydf, self.pointdf, how="left")
+ assert self.polydf.geometry.name == res.geometry.name
+
+ def test_sjoin_left(self):
+ df = sjoin(self.pointdf, self.polydf, how="left")
+ assert df.shape == (21, 8)
+ for i, row in df.iterrows():
+ assert row.geometry.geom_type == "Point"
+ assert "pointattr1" in df.columns
+ assert "BoroCode" in df.columns
+
+ def test_sjoin_right(self):
+ # the inverse of left
+ df = sjoin(self.pointdf, self.polydf, how="right")
+ df2 = sjoin(self.polydf, self.pointdf, how="left")
+ assert df.shape == (12, 8)
+ assert df.shape == df2.shape
+ for i, row in df.iterrows():
+ assert row.geometry.geom_type == "MultiPolygon"
+ for i, row in df2.iterrows():
+ assert row.geometry.geom_type == "MultiPolygon"
+
+ def test_sjoin_inner(self):
+ df = sjoin(self.pointdf, self.polydf, how="inner")
+ assert df.shape == (11, 8)
+
+ def test_sjoin_predicate(self):
+ # points within polygons
+ df = sjoin(self.pointdf, self.polydf, how="left", predicate="within")
+ assert df.shape == (21, 8)
+ assert df.loc[1]["BoroName"] == "Staten Island"
+
+ # points contain polygons? never happens so we should have nulls
+ df = sjoin(self.pointdf, self.polydf, how="left", predicate="contains")
+ assert df.shape == (21, 8)
+ assert np.isnan(df.loc[1]["Shape_Area"])
+
+ def test_sjoin_bad_predicate(self):
+ # AttributeError: 'Point' object has no attribute 'spandex'
+ with pytest.raises(ValueError):
+ sjoin(self.pointdf, self.polydf, how="left", predicate="spandex")
+
+ def test_sjoin_duplicate_column_name(self):
+ pointdf2 = self.pointdf.rename(columns={"pointattr1": "Shape_Area"})
+ df = sjoin(pointdf2, self.polydf, how="left")
+ assert "Shape_Area_left" in df.columns
+ assert "Shape_Area_right" in df.columns
+
+ @pytest.mark.parametrize("how", ["left", "right", "inner"])
+ def test_sjoin_named_index(self, how):
+ # original index names should be unchanged
+ pointdf2 = self.pointdf.copy()
+ pointdf2.index.name = "pointid"
+ polydf = self.polydf.copy()
+ polydf.index.name = "polyid"
+
+ res = sjoin(pointdf2, polydf, how=how)
+ assert pointdf2.index.name == "pointid"
+ assert polydf.index.name == "polyid"
+
+ # original index name should pass through to result
+ if how == "right":
+ assert res.index.name == "polyid"
+ else: # how == "left", how == "inner"
+ assert res.index.name == "pointid"
+
+ def test_sjoin_values(self):
+ # GH190
+ self.polydf.index = [1, 3, 4, 5, 6]
+ df = sjoin(self.pointdf, self.polydf, how="left")
+ assert df.shape == (21, 8)
+ df = sjoin(self.polydf, self.pointdf, how="left")
+ assert df.shape == (12, 8)
+
+ @pytest.mark.xfail
+ def test_no_overlapping_geometry(self):
+ # Note: these tests are for correctly returning GeoDataFrame
+ # when result of the join is empty
+
+ df_inner = sjoin(self.pointdf.iloc[17:], self.polydf, how="inner")
+ df_left = sjoin(self.pointdf.iloc[17:], self.polydf, how="left")
+ df_right = sjoin(self.pointdf.iloc[17:], self.polydf, how="right")
+
+ expected_inner_df = pd.concat(
+ [
+ self.pointdf.iloc[:0],
+ pd.Series(name="index_right", dtype="int64"),
+ self.polydf.drop("geometry", axis=1).iloc[:0],
+ ],
+ axis=1,
+ )
+
+ expected_inner = GeoDataFrame(expected_inner_df)
+
+ expected_right_df = pd.concat(
+ [
+ self.pointdf.drop("geometry", axis=1).iloc[:0],
+ pd.concat(
+ [
+ pd.Series(name="index_left", dtype="int64"),
+ pd.Series(name="index_right", dtype="int64"),
+ ],
+ axis=1,
+ ),
+ self.polydf,
+ ],
+ axis=1,
+ )
+
+ expected_right = GeoDataFrame(expected_right_df).set_index("index_right")
+
+ expected_left_df = pd.concat(
+ [
+ self.pointdf.iloc[17:],
+ pd.Series(name="index_right", dtype="int64"),
+ self.polydf.iloc[:0].drop("geometry", axis=1),
+ ],
+ axis=1,
+ )
+
+ expected_left = GeoDataFrame(expected_left_df)
+
+ assert expected_inner.equals(df_inner)
+ assert expected_right.equals(df_right)
+ assert expected_left.equals(df_left)
+
+ @pytest.mark.skip("Not implemented")
+ def test_sjoin_outer(self):
+ df = sjoin(self.pointdf, self.polydf, how="outer")
+ assert df.shape == (21, 8)
+
+ def test_sjoin_empty_geometries(self):
+ # https://github.com/geopandas/geopandas/issues/944
+ empty = GeoDataFrame(geometry=[GeometryCollection()] * 3, crs=self.crs)
+ df = sjoin(pd.concat([self.pointdf, empty]), self.polydf, how="left")
+ assert df.shape == (24, 8)
+ df2 = sjoin(self.pointdf, pd.concat([self.polydf, empty]), how="left")
+ assert df2.shape == (21, 8)
+
+ @pytest.mark.parametrize("predicate", ["intersects", "within", "contains"])
def test_sjoin_no_valid_geoms(self, predicate):
"""Tests a completely empty GeoDataFrame."""
- pass
+ empty = GeoDataFrame(geometry=[], crs=self.pointdf.crs)
+ assert sjoin(self.pointdf, empty, how="inner", predicate=predicate).empty
+ assert sjoin(self.pointdf, empty, how="right", predicate=predicate).empty
+ assert sjoin(empty, self.pointdf, how="inner", predicate=predicate).empty
+ assert sjoin(empty, self.pointdf, how="left", predicate=predicate).empty
+
+ def test_empty_sjoin_return_duplicated_columns(self, nybb_filename):
+ nybb = geopandas.read_file(nybb_filename)
+ nybb2 = nybb.copy()
+ nybb2.geometry = nybb2.translate(200000) # to get non-overlapping
+
+ result = geopandas.sjoin(nybb, nybb2)
+
+ assert "BoroCode_right" in result.columns
+ assert "BoroCode_left" in result.columns
+
+
+@pytest.fixture
+def world(naturalearth_lowres):
+ return read_file(naturalearth_lowres)
+
+
+@pytest.fixture
+def cities(naturalearth_cities):
+ return read_file(naturalearth_cities)
+
+
+def test_sjoin_inner(world, cities):
+ # GH637
+ countries = world[["geometry", "name"]]
+ countries = countries.rename(columns={"name": "country"})
+ cities_with_country = sjoin(cities, countries, how="inner", predicate="intersects")
+ assert cities_with_country.shape == (213, 4)
class TestNearest:
- pass
+ @pytest.mark.parametrize(
+ "how_kwargs", ({}, {"how": "inner"}, {"how": "left"}, {"how": "right"})
+ )
+ def test_allowed_hows(self, how_kwargs):
+ left = geopandas.GeoDataFrame({"geometry": []})
+ right = geopandas.GeoDataFrame({"geometry": []})
+ sjoin_nearest(left, right, **how_kwargs) # no error
+
+ @pytest.mark.parametrize("how", ("outer", "abcde"))
+ def test_invalid_hows(self, how: str):
+ left = geopandas.GeoDataFrame({"geometry": []})
+ right = geopandas.GeoDataFrame({"geometry": []})
+ with pytest.raises(ValueError, match="`how` was"):
+ sjoin_nearest(left, right, how=how)
+
+ @pytest.mark.parametrize("distance_col", (None, "distance"))
+ def test_empty_right_df_how_left(self, distance_col: str):
+ # all records from left and no results from right
+ left = geopandas.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
+ right = geopandas.GeoDataFrame({"geometry": []})
+ joined = sjoin_nearest(
+ left,
+ right,
+ how="left",
+ distance_col=distance_col,
+ )
+ assert_geoseries_equal(joined["geometry"], left["geometry"])
+ assert joined["index_right"].isna().all()
+ if distance_col is not None:
+ assert joined[distance_col].isna().all()
+
+ @pytest.mark.parametrize("distance_col", (None, "distance"))
+ def test_empty_right_df_how_right(self, distance_col: str):
+ # no records in joined
+ left = geopandas.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
+ right = geopandas.GeoDataFrame({"geometry": []})
+ joined = sjoin_nearest(
+ left,
+ right,
+ how="right",
+ distance_col=distance_col,
+ )
+ assert joined.empty
+ if distance_col is not None:
+ assert distance_col in joined
+
+ @pytest.mark.parametrize("how", ["inner", "left"])
+ @pytest.mark.parametrize("distance_col", (None, "distance"))
+ def test_empty_left_df(self, how, distance_col: str):
+ right = geopandas.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
+ left = geopandas.GeoDataFrame({"geometry": []})
+ joined = sjoin_nearest(left, right, how=how, distance_col=distance_col)
+ assert joined.empty
+ if distance_col is not None:
+ assert distance_col in joined
+
+ @pytest.mark.parametrize("distance_col", (None, "distance"))
+ def test_empty_left_df_how_right(self, distance_col: str):
+ right = geopandas.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
+ left = geopandas.GeoDataFrame({"geometry": []})
+ joined = sjoin_nearest(
+ left,
+ right,
+ how="right",
+ distance_col=distance_col,
+ )
+ assert_geoseries_equal(joined["geometry"], right["geometry"])
+ assert joined["index_left"].isna().all()
+ if distance_col is not None:
+ assert joined[distance_col].isna().all()
+
+ @pytest.mark.parametrize("how", ["inner", "left"])
+ def test_empty_join_due_to_max_distance(self, how):
+ # after applying max_distance the join comes back empty
+ # (as in NaN in the joined columns)
+ left = geopandas.GeoDataFrame({"geometry": [Point(0, 0)]})
+ right = geopandas.GeoDataFrame({"geometry": [Point(1, 1), Point(2, 2)]})
+ joined = sjoin_nearest(
+ left,
+ right,
+ how=how,
+ max_distance=1,
+ distance_col="distances",
+ )
+ expected = left.copy()
+ expected["index_right"] = [np.nan]
+ expected["distances"] = [np.nan]
+ if how == "inner":
+ expected = expected.dropna()
+ expected["index_right"] = expected["index_right"].astype("int64")
+ assert_geodataframe_equal(joined, expected)
+
+ def test_empty_join_due_to_max_distance_how_right(self):
+ # after applying max_distance the join comes back empty
+ # (as in NaN in the joined columns)
+ left = geopandas.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
+ right = geopandas.GeoDataFrame({"geometry": [Point(2, 2)]})
+ joined = sjoin_nearest(
+ left,
+ right,
+ how="right",
+ max_distance=1,
+ distance_col="distances",
+ )
+ expected = right.copy()
+ expected["index_left"] = [np.nan]
+ expected["distances"] = [np.nan]
+ expected = expected[["index_left", "geometry", "distances"]]
+ assert_geodataframe_equal(joined, expected)
+
+ @pytest.mark.parametrize("how", ["inner", "left"])
+ def test_max_distance(self, how):
+ left = geopandas.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
+ right = geopandas.GeoDataFrame({"geometry": [Point(1, 1), Point(2, 2)]})
+ joined = sjoin_nearest(
+ left,
+ right,
+ how=how,
+ max_distance=1,
+ distance_col="distances",
+ )
+ expected = left.copy()
+ expected["index_right"] = [np.nan, 0]
+ expected["distances"] = [np.nan, 0]
+ if how == "inner":
+ expected = expected.dropna()
+ expected["index_right"] = expected["index_right"].astype("int64")
+ assert_geodataframe_equal(joined, expected)
+
+ def test_max_distance_how_right(self):
+ left = geopandas.GeoDataFrame({"geometry": [Point(1, 1), Point(2, 2)]})
+ right = geopandas.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
+ joined = sjoin_nearest(
+ left,
+ right,
+ how="right",
+ max_distance=1,
+ distance_col="distances",
+ )
+ expected = right.copy()
+ expected["index_left"] = [np.nan, 0]
+ expected["distances"] = [np.nan, 0]
+ expected = expected[["index_left", "geometry", "distances"]]
+ assert_geodataframe_equal(joined, expected)
+
+ @pytest.mark.parametrize("how", ["inner", "left"])
+ @pytest.mark.parametrize(
+ "geo_left, geo_right, expected_left, expected_right, distances",
+ [
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(1, 1)],
+ [0, 1],
+ [0, 0],
+ [math.sqrt(2), 0],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(1, 1), Point(0, 0)],
+ [0, 1],
+ [1, 0],
+ [0, 0],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(1, 1), Point(0, 0), Point(0, 0)],
+ [0, 0, 1],
+ [1, 2, 0],
+ [0, 0, 0],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(1, 1), Point(0, 0), Point(2, 2)],
+ [0, 1],
+ [1, 0],
+ [0, 0],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(1, 1), Point(0.25, 1)],
+ [0, 1],
+ [1, 0],
+ [math.sqrt(0.25**2 + 1), 0],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(-10, -10), Point(100, 100)],
+ [0, 1],
+ [0, 0],
+ [math.sqrt(10**2 + 10**2), math.sqrt(11**2 + 11**2)],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(x, y) for x, y in zip(np.arange(10), np.arange(10))],
+ [0, 1],
+ [0, 1],
+ [0, 0],
+ ),
+ (
+ [Point(0, 0), Point(1, 1), Point(0, 0)],
+ [Point(1.1, 1.1), Point(0, 0)],
+ [0, 1, 2],
+ [1, 0, 1],
+ [0, np.sqrt(0.1**2 + 0.1**2), 0],
+ ),
+ ],
+ )
+ def test_sjoin_nearest_left(
+ self,
+ geo_left,
+ geo_right,
+ expected_left: Sequence[int],
+ expected_right: Sequence[int],
+ distances: Sequence[float],
+ how,
+ ):
+ left = geopandas.GeoDataFrame({"geometry": geo_left})
+ right = geopandas.GeoDataFrame({"geometry": geo_right})
+ expected_gdf = left.iloc[expected_left].copy()
+ expected_gdf["index_right"] = expected_right
+ # without distance col
+ joined = sjoin_nearest(left, right, how=how)
+ # inner / left join give a different row order
+ check_like = how == "inner"
+ assert_geodataframe_equal(expected_gdf, joined, check_like=check_like)
+ # with distance col
+ expected_gdf["distance_col"] = np.array(distances, dtype=float)
+ joined = sjoin_nearest(left, right, how=how, distance_col="distance_col")
+ assert_geodataframe_equal(expected_gdf, joined, check_like=check_like)
+
+ @pytest.mark.parametrize(
+ "geo_left, geo_right, expected_left, expected_right, distances",
+ [
+ ([Point(0, 0), Point(1, 1)], [Point(1, 1)], [1], [0], [0]),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(1, 1), Point(0, 0)],
+ [1, 0],
+ [0, 1],
+ [0, 0],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(1, 1), Point(0, 0), Point(0, 0)],
+ [1, 0, 0],
+ [0, 1, 2],
+ [0, 0, 0],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(1, 1), Point(0, 0), Point(2, 2)],
+ [1, 0, 1],
+ [0, 1, 2],
+ [0, 0, math.sqrt(2)],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(1, 1), Point(0.25, 1)],
+ [1, 1],
+ [0, 1],
+ [0, 0.75],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(-10, -10), Point(100, 100)],
+ [0, 1],
+ [0, 1],
+ [math.sqrt(10**2 + 10**2), math.sqrt(99**2 + 99**2)],
+ ),
+ (
+ [Point(0, 0), Point(1, 1)],
+ [Point(x, y) for x, y in zip(np.arange(10), np.arange(10))],
+ [0, 1] + [1] * 8,
+ list(range(10)),
+ [0, 0] + [np.sqrt(x**2 + x**2) for x in np.arange(1, 9)],
+ ),
+ (
+ [Point(0, 0), Point(1, 1), Point(0, 0)],
+ [Point(1.1, 1.1), Point(0, 0)],
+ [1, 0, 2],
+ [0, 1, 1],
+ [np.sqrt(0.1**2 + 0.1**2), 0, 0],
+ ),
+ ],
+ )
+ def test_sjoin_nearest_right(
+ self,
+ geo_left,
+ geo_right,
+ expected_left: Sequence[int],
+ expected_right: Sequence[int],
+ distances: Sequence[float],
+ ):
+ left = geopandas.GeoDataFrame({"geometry": geo_left})
+ right = geopandas.GeoDataFrame({"geometry": geo_right})
+ expected_gdf = right.iloc[expected_right].copy()
+ expected_gdf["index_left"] = expected_left
+ expected_gdf = expected_gdf[["index_left", "geometry"]]
+ # without distance col
+ joined = sjoin_nearest(left, right, how="right")
+ assert_geodataframe_equal(expected_gdf, joined)
+ # with distance col
+ expected_gdf["distance_col"] = np.array(distances, dtype=float)
+ joined = sjoin_nearest(left, right, how="right", distance_col="distance_col")
+ assert_geodataframe_equal(expected_gdf, joined)
+
+ @pytest.mark.filterwarnings("ignore:Geometry is in a geographic CRS")
+ def test_sjoin_nearest_inner(self, naturalearth_lowres, naturalearth_cities):
+ # check equivalency of left and inner join
+ countries = read_file(naturalearth_lowres)
+ cities = read_file(naturalearth_cities)
+ countries = countries[["geometry", "name"]].rename(columns={"name": "country"})
+
+ # default: inner and left give the same result
+ result1 = sjoin_nearest(cities, countries, distance_col="dist")
+ assert result1.shape[0] == cities.shape[0]
+ result2 = sjoin_nearest(cities, countries, distance_col="dist", how="inner")
+ assert_geodataframe_equal(result2, result1)
+ result3 = sjoin_nearest(cities, countries, distance_col="dist", how="left")
+ assert_geodataframe_equal(result3, result1, check_like=True)
+
+ # with max_distance: rows that go above are dropped in case of inner
+ result4 = sjoin_nearest(cities, countries, distance_col="dist", max_distance=1)
+ assert_geodataframe_equal(
+ result4, result1[result1["dist"] < 1], check_like=True
+ )
+ result5 = sjoin_nearest(
+ cities, countries, distance_col="dist", max_distance=1, how="left"
+ )
+ assert result5.shape[0] == cities.shape[0]
+ result5 = result5.dropna()
+ result5["index_right"] = result5["index_right"].astype("int64")
+ assert_geodataframe_equal(result5, result4, check_like=True)
+
+ @pytest.mark.parametrize(
+ "max_distance,expected", [(None, [1, 3, 3, 1, 2]), (1.1, [3, 3, 1, 2])]
+ )
+ def test_sjoin_nearest_exclusive(self, max_distance, expected):
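+        # exclusive=True matches each geometry to its nearest neighbour other than
+        # itself; Point(1, 2) is equidistant to (1, 1) and (2, 2), so the tie
+        # produces two result rows for that input geometry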
+ geoms = shapely.points(np.arange(3), np.arange(3))
+ geoms = np.append(geoms, [Point(1, 2)])
+
+ df = geopandas.GeoDataFrame({"geometry": geoms})
+ result = df.sjoin_nearest(
+ df, max_distance=max_distance, distance_col="dist", exclusive=True
+ )
+
+ assert_series_equal(
+ result["index_right"].reset_index(drop=True),
+ pd.Series(expected),
+ check_names=False,
+ )
+
+ if max_distance:
+ assert result["dist"].max() <= max_distance
diff --git a/geopandas/tools/tests/test_tools.py b/geopandas/tools/tests/test_tools.py
index a51978e3..603aad0d 100644
--- a/geopandas/tools/tests/test_tools.py
+++ b/geopandas/tools/tests/test_tools.py
@@ -1,8 +1,51 @@
from shapely.geometry import LineString, MultiPoint, Point
+
from geopandas import GeoSeries
from geopandas.tools import collect
+
import pytest
class TestTools:
- pass
+ def setup_method(self):
+ self.p1 = Point(0, 0)
+ self.p2 = Point(1, 1)
+ self.p3 = Point(2, 2)
+ self.mpc = MultiPoint([self.p1, self.p2, self.p3])
+
+ self.mp1 = MultiPoint([self.p1, self.p2])
+ self.line1 = LineString([(3, 3), (4, 4)])
+
+ def test_collect_single(self):
+ result = collect(self.p1)
+ assert self.p1.equals(result)
+
+ def test_collect_single_force_multi(self):
+ result = collect(self.p1, multi=True)
+ expected = MultiPoint([self.p1])
+ assert expected.equals(result)
+
+ def test_collect_multi(self):
+ result = collect(self.mp1)
+ assert self.mp1.equals(result)
+
+ def test_collect_multi_force_multi(self):
+ result = collect(self.mp1)
+ assert self.mp1.equals(result)
+
+ def test_collect_list(self):
+ result = collect([self.p1, self.p2, self.p3])
+ assert self.mpc.equals(result)
+
+ def test_collect_GeoSeries(self):
+ s = GeoSeries([self.p1, self.p2, self.p3])
+ result = collect(s)
+ assert self.mpc.equals(result)
+
+ def test_collect_mixed_types(self):
+ with pytest.raises(ValueError):
+ collect([self.p1, self.line1])
+
+ def test_collect_mixed_multi(self):
+ with pytest.raises(ValueError):
+ collect([self.mpc, self.mp1])
diff --git a/geopandas/tools/util.py b/geopandas/tools/util.py
index bea1b2db..5d4c507e 100644
--- a/geopandas/tools/util.py
+++ b/geopandas/tools/util.py
@@ -1,8 +1,13 @@
import pandas as pd
+
from shapely.geometry import MultiLineString, MultiPoint, MultiPolygon
from shapely.geometry.base import BaseGeometry
-_multi_type_map = {'Point': MultiPoint, 'LineString': MultiLineString,
- 'Polygon': MultiPolygon}
+
+_multi_type_map = {
+ "Point": MultiPoint,
+ "LineString": MultiLineString,
+ "Polygon": MultiPolygon,
+}
def collect(x, multi=False):
@@ -18,4 +23,23 @@ def collect(x, multi=False):
only have one component.
"""
- pass
+ if isinstance(x, BaseGeometry):
+ x = [x]
+ elif isinstance(x, pd.Series):
+ x = list(x)
+
+ # We cannot create GeometryCollection here so all types
+ # must be the same. If there is more than one element,
+ # they cannot be Multi*, i.e., can't pass in combination of
+ # Point and MultiPoint... or even just MultiPoint
+ t = x[0].geom_type
+ if not all(g.geom_type == t for g in x):
+ raise ValueError("Geometry type must be homogeneous")
+ if len(x) > 1 and t.startswith("Multi"):
+ raise ValueError("Cannot collect {0}. Must have single geometries".format(t))
+
+ if len(x) == 1 and (t.startswith("Multi") or not multi):
+ # If there's only one single part geom and we're not forcing to
+ # multi, then just return it
+ return x[0]
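+    # otherwise wrap the single-part geometries in the corresponding Multi* type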
+ return _multi_type_map[t](x)