Hide code cell source
%load_ext watermark
import pandas as pd
import numpy as np
from typing import Type, Optional, Callable
from typing import List, Dict, Union, Tuple

import sklearn.linear_model
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import scipy.stats as stats
import statsmodels.api as sm

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
import seaborn as sns

import setvariables as conf_
import reportclass as r_class

import unittest

Land use class#

The LandUse class takes the land-use, land-cover and streets data and merges them into a single dataframe whose index is the survey location and whose columns are the scaled values of the attributes measured in QGIS. The land-cover and land-use data are given as a percentage of the available dry land attributed to a specific label. The streets are given as the length in meters.

The default scaling method is Min-Max: each value in a column is mapped onto the unit interval using the column minimum and maximum.
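In formula form, this is

$$x_{scaled} = \frac{x - \min(x)}{\max(x) - \min(x)}$$

which is what the scale_a_column helper below implements.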

def scale_a_column(df: pd.DataFrame, column_to_scale: str, column_name: str = 'length') -> pd.DataFrame:
    """Min-max scale <column_to_scale> and store the result in <column_name>."""

    # calculate the minimum and maximum values in the column
    min_value = df[column_to_scale].min()
    max_value = df[column_to_scale].max()

    # perform min-max scaling and assign the result to the target column
    df[column_name] = (df[column_to_scale] - min_value) / (max_value - min_value)

    return df

Why is this important?#

Because it is another way to get proxies for usage and population.

We assume there is a relationship between how the land is used and what we find on the ground. Archaeologists and anthropologists make this basic assumption every time they undertake an excavation and interpret the results in the context of other findings. This interpretation of beach-litter data does exactly the same. As discussed in Near or far and in the federal report IQAASL, there is strong evidence at the national level to support a correlation between the density of objects found and specific topographic features that can be isolated on a standard topographical map.

What is important?#

The relationship between the topographical features and the density of the objects found, and the scale at which that relationship is described.

However, the measured features are not independent of each other. For example, if there are buildings in an area we also expect to find a road that leads to those buildings. This multicollinearity can lead to unstable coefficient estimates and makes it challenging to interpret the individual effects of the correlated variables on the target variable.
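One way to quantify this dependence is to look at the pairwise correlations between the attributes and the variance inflation factor (VIF) of each one. The sketch below assumes the scaled attribute matrix produced further on by LandUse.use_of_land (assigned to lcui) is available; it is an illustration using statsmodels, not part of the LandUse class itself.

# a minimal collinearity check, assuming `lcui` is the scaled
# attribute matrix from LandUse.use_of_land (defined later in this notebook)
from statsmodels.stats.outliers_influence import variance_inflation_factor

# pairwise correlations between the measured attributes
correlations = lcui.corr()

# variance inflation factor per attribute: values well above 5-10
# indicate that an attribute is largely explained by the others
X = sm.add_constant(lcui)
vif = pd.Series(
    [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    index=X.columns, name='VIF').drop('const')
print(vif.sort_values(ascending=False))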

The scale at which the results are valid is important for planning purposes. We consider that the scale at which the data is used should be comparable to the scale of the lowest administrative unit that is responsible. It may be tempting to treat this as a rating system, pointing the finger at the locations that have the highest density. It is better to interpret it as a list that helps prioritise resource allocation.

The reliability and the continuity of the data and analysis.

The collected data is a diverse combination of different interpretations of the same protocol. The reason for this is that there is no centralized authority that requires the data to be collected in a specific way. The basic assumption is that all stakeholders did the best they could within the limits of the resources at hand and the experience of their respective teams. From this we conclude that the surveys are a reasonable description of the minimum number of objects found at the defined survey location, within the limits of the length recorded. There are many ways that survey consistency can be improved; the easiest and fastest are: i. defining a standard method, ii. allocating resources to collect the data using the defined method.

The topographical data from the confederation provides continuity to what could otherwise be interpreted as unrelated observations. Furthermore, the labels provided for the various topographical features are indicators of use and have a real meaning to geographers and engineers in planning and development. Local associations that are involved in preventing and reducing litter may also be interested.

Using the land use class#

To start a LandUse class, first initiate a ReportClass for the boundaries of interest. In this example, all lakes have been selected for all results between 2015 and 2022.

# ! USER DEFINED INPUT
# Temporal and geographic boundaries.
boundaries = dict(feature_type='l', language='fr', start_date='2015-01-01', end_date='2022-01-01')

# pass request to API
# Make the report data and report
top_label, language, w_df, w_di = r_class.report_data(boundaries, survey_data.copy(), beaches, codes)
a_report = r_class.ReportClass(w_df, w_di, boundaries, top_label, 'fr', c_l)

The locations from the ReportClass are used to select the relevant geo-data. Survey locations that have no geo-data are dropped from the analysis, a list of dropped locations is produced.

# start land use class
w_df_locations = w_df.slug.unique()

# instantiate the land use class for the selected locations
m_ui = LandUse(land_cover, land_use, streets, w_df_locations)

# make a copy of the land use data
lcui = m_ui.use_of_land.copy()

# identify the locations with no topo data and keep only the
# locations that have topo data; locations with no topo data
# are not considered in the analysis
lc_sti, no_datai = match_topo_attributes_to_surveys(lcui, a_report.w_df)

Merge the validated geo-data with the survey results and scale the pcs_m column of the survey data.


# the basic work data contains the survey results and the 
# topographical data merged on the <slug> column
work_data_i = merge_topodata_and_surveydata(lc_sti, a_report.w_df)

# we are interested in the most common objects from the region of interest
# select the most common codes from the report class
# w_data_mc = work_data_i[work_data_i.code.isin(a_report.most_common.index)].copy()

# select an object from the most common:
selected = ["G27"]
w_datai = work_data_i[work_data_i.code.isin(selected)].copy()

# scale the value column
w_datai = scale_a_column(w_datai, column_to_scale='pcs_m', column_name="scaled")

The code for the LandUse class is below:

Hide code cell source
def match_topo_attributes_to_surveys(topo_data: pd.DataFrame, survey_data: pd.DataFrame)-> Tuple[pd.DataFrame,List]:
    """
    Match topographic attributes to survey data for specific locations.

    This function takes topographic attribute data and survey data and matches them based on the unique locations (slugs).
    
    Parameters:
        topo_data (pd.DataFrame): A DataFrame containing topographic attribute data.
        survey_data (pd.DataFrame): A DataFrame containing survey data.

    Returns:
        Tuple[pd.DataFrame, List]: A tuple containing two elements:
            - A DataFrame containing topographic attribute data for locations found in both datasets.
            - A list of locations (slugs) from the survey data for which there is no matching topographic data.

    """

    locations = survey_data.slug.unique()
    available = topo_data.index
    # identify the locations that have no topo data
    no_data = [x for x in locations if x not in available]

    # take the available data and names of locations with no data
    locations_with_data = [x for x in locations if x in available]
    
    return topo_data.loc[locations_with_data], no_data

def merge_topodata_and_surveydata(topo, surveys, columns: List[str] = conf_.work_columns)-> pd.DataFrame:
    """
    Merge survey data with topographic data using location information.

    This function merges survey data with topographic data using the 'slug' column in the survey data
    and the index of the topographic data. The merged DataFrame will contain the specified columns from the survey data.

    Parameters:
        topo (pd.DataFrame): A DataFrame containing topographic data with the location index.
        surveys (pd.DataFrame): A DataFrame containing survey data with a 'slug' column for location matching.
        columns (List[str]): A list of column names to include in the merged DataFrame (default is defined in conf.work_columns).

    Returns:
        pd.DataFrame: A merged DataFrame containing the specified survey data columns and topographic data.
    """
    # merges surveys to topo using the slug column in surveys
    # and the index in topo
    return surveys[columns].merge(topo, left_on='slug', right_index=True)

def scale_a_column(df: pd.DataFrame, column_to_scale: str, column_name: str = 'length') -> pd.DataFrame:
    """Min-max scale <column_to_scale> and store the result in <column_name>."""

    # calculate the minimum and maximum values in the column
    min_value = df[column_to_scale].min()
    max_value = df[column_to_scale].max()

    # perform min-max scaling and assign the result to the target column
    df[column_name] = (df[column_to_scale] - min_value) / (max_value - min_value)

    return df

def group_topographic_attributes(df: pd.DataFrame, list_of_labels: List = None, locations: List = None, coi: str = 'scale')-> pd.DataFrame:
    """
    Group and aggregate topographic attributes in a DataFrame.

    This function groups and aggregates topographic attributes in the provided DataFrame. You can specify a list of labels
    to group attributes, filter locations, and choose the column of interest for aggregation.

    Parameters:
        df (pd.DataFrame): A DataFrame containing topographic attributes.
        list_of_labels (List, optional): A list of dictionaries with keys as new attribute names and values as properties to group.
        locations (List, optional): A list of locations to filter the data (default is None, no location filtering).
        coi (str, optional): The column of interest for aggregation (default is 'scale').

    Returns:
        pd.DataFrame: A DataFrame with aggregated topographic attributes based on the specified grouping and filtering.
   """
    
    if locations is not None:
        df = df.loc[df.slug.isin(locations)]    
    # list of labels is a list of dictionaries
    if list_of_labels is not None:
        for new_labels in list_of_labels:
            # the attributes, the dictionary values are 
            # properties being grouped
            attributes = list(new_labels.values())
            # the dictionary key is the new name of
            # the attributes in the list
            new_val = list(new_labels.keys())
            df.loc[df['attribute'].isin(attributes[0]), 'attribute'] = new_val[0]
    # sum the occurrences of the same attribute
    r = df.groupby(['slug','attribute'], as_index=False)[coi].sum()

    # pivot so that the index is the locations and
    # the columns are the attributes
    r = r.pivot(columns='attribute', index='slug')            
            
    return r.droplevel(0, axis=1).fillna(0)

def statistic_of_critical_value(df, 
                                df_feature_columns, 
                                df_target_column, 
                                sample_id: str = 'loc_date',
                                value_counts: bool = True,
                                average: bool = False):
    """
    Compute statistics of critical values for given data.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the data.
    df_feature_columns (list): A list of columns to be used as feature columns.
    df_target_column (str): The name of the target column.
    sample_id (str, optional): The column representing sample identifiers (default is 'loc_date').
    value_counts (bool, optional): If True, compute value counts as weights (default is True).
    average (bool, optional): If True, compute the median of the target column for each feature value; this takes precedence over value_counts (default is False).

    Returns:
    pd.DataFrame: A DataFrame containing computed statistics based on the specified options.
    """
    d = pd.melt(df, value_vars=df_feature_columns, id_vars=[df_target_column, sample_id])
    
    if value_counts:
        di = d.groupby('variable', as_index=False)['value'].value_counts()
        di['weight'] = di['count']/d[sample_id].nunique()
        di = di.pivot(columns='variable', index='value', values='weight')
    if average:        
        di = d.groupby(['variable', 'value'], as_index=False)[df_target_column].median()
        di = di.pivot(columns='variable', index='value', values=df_target_column)
                
    return di

class LandUse:
    """
    A class for analyzing and transforming land use data.

    This class provides methods to analyze land cover, land use, and transportation data.
    It allows you to group attributes, scale data, and create ordinal rankings based on quantiles.

    Parameters:
        land_cover (pd.DataFrame): DataFrame containing land cover data.
        land_use (pd.DataFrame): DataFrame containing land use data.
        transportation (pd.DataFrame): DataFrame containing transportation data.
        locations (List): List of locations for filtering data.
        street_groups (List, optional): List of street groups (default is from configuration).
        land_use_groups (List, optional): List of land use groups (default is from configuration).

    Attributes:
        quantiles (List): List of quantile values for ordinal ranking.
        labels (List): List of labels corresponding to quantile groups.

    Properties:
        - land_cover: Grouped and aggregated land cover data.
        - land_use: Grouped and aggregated land use data.
        - trans: Grouped and aggregated transportation data.
        - use_of_land: Combined land-cover, land-use and transportation data, with each column scaled between 0 and 1.
        - ordinal_land_rank: Ordinal ranking based on quantiles for land use data.

    Example:
        land_use_data = LandUse(land_cover_data, land_use_data, transportation_data, locations)
        land_cover = land_use_data.land_cover
        land_use = land_use_data.land_use
        trans_data = land_use_data.trans
        land_rankings = land_use_data.ordinal_land_rank
    """
    street_groups = conf_.street_groups
    land_use_groups = conf_.lu_groups

    def __init__(self, land_cover, land_use, transportation, locations, street_groups=street_groups, land_use_groups=land_use_groups):
        self.lc = land_cover
        self.lu = land_use
        self.tr = transportation
        self.locations = locations
        self.lug = land_use_groups
        self.stg = street_groups
        self.quantiles = [0.0, 0.03, 0.25, 0.75, 0.97, 1.0]
        self.labels = ['lowest', 'low', 'middle', 'high', 'highest']
        
    @property
    def land_cover(self):
        # the land-cover attributes keep their original labels
        return group_topographic_attributes(self.lc, locations=self.locations, list_of_labels=None)

    @property
    def land_use(self):
        # the land-use attributes are combined according to the configured groups
        return group_topographic_attributes(self.lu, locations=self.locations, list_of_labels=self.lug)

    @property
    def trans(self):
        # street lengths are combined according to the configured street groups
        return group_topographic_attributes(self.tr, locations=self.locations, list_of_labels=self.stg, coi='length')

    @property
    def use_of_land(self):
        # combine the land-cover, land-use and street data on the location index
        a = self.land_cover.merge(self.land_use, left_index=True, right_index=True)
        b = a.merge(self.trans, left_index=True, right_index=True)

        # scale each column between 0 and 1
        self.scaler = MinMaxScaler()
        scaled_data = self.scaler.fit_transform(b)

        return pd.DataFrame(scaled_data, columns=b.columns, index=b.index)
    
    @property
    def ordinal_land_rank(self):
        ranked_df = self.use_of_land.copy()
        columns_to_rank = ranked_df.columns
        for column in columns_to_rank:
            label = f'{column}_ordinal_rank'
            ranked_df[label] = pd.cut(ranked_df[column], bins=self.quantiles, labels=self.labels, include_lowest=True)
            ranked_df[column] = ranked_df[label]
            ranked_df.drop(label, inplace=True, axis=1)
        return ranked_df

Land use and density#

w_datai.feature_type.unique()
array(['l', 'r', 'p'], dtype=object)
The weight of each category in the feature variables. For example: 5% of the samples come from locations that fall in the lowest category of land attributed to forests (Forêt).
               Verger  Vignes  Bâtiments  Forêt  Services Publics  Rues  Non Défini
Le Plus Bas      0,91    0,82       0,05   0,05              0,23  0,64        0,05
Faible           0,00    0,05       0,18   0,14              0,59  0,05        0,73
Milieu           0,05    0,09       0,73   0,14              0,14  0,27        0,18
Élévé            0,00    0,00       0,00   0,64              0,00  0,00        0,00
Le Plus Élévé    0,05    0,05       0,05   0,05              0,05  0,05        0,05
The median pcs/m of each category in the feature variables. For example: the median pcs/m at the lowest category of forests was 0.00. A sketch of how these two tables can be assembled follows below.
               Verger  Vignes  Bâtiments  Forêt  Services Publics  Rues  Non Défini
Le Plus Bas      0,11    0,17       0,03   0,00              0,00  0,17        0,04
Faible            nan    0,04       0,00   0,04              0,21  0,24        0,17
Milieu           0,04    0,00       0,22   0,00              0,09  0,00        0,06
Élévé             nan     nan        nan   0,17               nan   nan         nan
Le Plus Élévé    0,00    0,00       0,04   0,24              0,04  0,04        0,00

Locations with no land-use data are excluded.
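The two tables above can be assembled, in principle, by ranking each attribute with LandUse.ordinal_land_rank and summarising the selected survey results with statistic_of_critical_value. The sketch below makes a few assumptions: m_ui, a_report and selected from the earlier cells are in scope, the work columns include loc_date, code and pcs_m, and the French category labels in the displayed tables are translations of the English labels used in the class.

# a sketch of how the weight and median tables can be produced
# replace the scaled attribute values with their ordinal (quantile) rank
ranked = m_ui.ordinal_land_rank

# merge the ranked attributes onto the survey results and keep the selected object
ranked_surveys = merge_topodata_and_surveydata(ranked, a_report.w_df)
ranked_surveys = ranked_surveys[ranked_surveys.code.isin(selected)].copy()

feature_columns = ranked.columns.tolist()

# share of samples in each rank, per attribute (first table)
weights = statistic_of_critical_value(
    ranked_surveys, feature_columns, 'pcs_m', sample_id='loc_date', value_counts=True)

# median pcs/m in each rank, per attribute (second table)
medians = statistic_of_critical_value(
    ranked_surveys, feature_columns, 'pcs_m', sample_id='loc_date', average=True)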

Unit tests#

Grouping topographical data#

Hide code cell source
class TestGroupTopographicAttributes(unittest.TestCase):

    def test_group_topographic_attributes_no_grouping(self):
        # sample DataFrame for testing
        data = {
            'slug': ['Location1', 'Location2', 'Location1', 'location3', 'Location2'],
            'attribute': ['A', 'B', 'A','C', 'A'],
            'scale': [10, 20, 5, 5, 5]
        }
        df = pd.DataFrame(data)        

        # Test without grouping
        result = group_topographic_attributes(df)
        result.reset_index(drop=False, inplace=True)
        result.columns.name = None
                
        # expected result
        expected = pd.DataFrame({
            'A': {'Location1': 15.0, 'Location2': 5.0, 'location3': 0.0},
            'B': {'Location1': 0.0, 'Location2': 20.0, 'location3': 0.0},
            'C': {'Location1': 0.0, 'Location2': 0.0, 'location3': 5.0}})
        expected.index.name = "slug"
        expected.reset_index(drop=False, inplace=True)
        # Assert 
        pd.testing.assert_frame_equal(result, expected)

    def test_group_topographic_attributes_with_grouping(self):
        # Create a sample DataFrame for testing
        data = {
            'slug': ['Location1', 'Location2', 'Location1', 'Location3', 'Location1'],
            'attribute': ['PropertyA', 'PropertyB', 'PropertyA', 'PropertyC', 'PropertyB'],
            'scale': [10, 20, 5, 5, 10]
        }
        df = pd.DataFrame(data)   

        # Test with grouping
        labels = [{'NewPropertyA': ['PropertyA', 'PropertyB']}]
        result = group_topographic_attributes(df, list_of_labels=labels)
        result.index.name = None
        result.columns.name = None
                
        # Define your expected result based on the input data and grouping
        expected = pd.DataFrame({
            'NewPropertyA': {'Location1': 25.0, 'Location2': 20.0, 'Location3': 0.0},
            'PropertyC': {'Location1': 0.0, 'Location2': 0.0, 'Location3': 5.0}})
        
        # Assert that the result matches the expected result
        pd.testing.assert_frame_equal(result, expected)
        
test_suite = unittest.TestLoader().loadTestsFromTestCase(TestGroupTopographicAttributes)
test_runner = unittest.TextTestRunner(verbosity=2)
test_result = test_runner.run(test_suite)
test_group_topographic_attributes_no_grouping (__main__.TestGroupTopographicAttributes) ... ok
test_group_topographic_attributes_with_grouping (__main__.TestGroupTopographicAttributes) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.015s

OK

Statistic of critical value#

Hide code cell source
class TestCriticalValue(unittest.TestCase):

    def test_value_counts(self):
        data = {'sample_id': ['A', 'A', 'B', 'B', 'C'],
                'feature1': [1, 2, 1, 3, 1],
                'feature2': [2, 3, 2, 4, 2],
                'target1': [5, 6, 7, 8, 9]}

        df = pd.DataFrame(data)
        df_feature_columns = ['feature1', 'feature2']
        df_target_column = 'target1'
        expected_result = pd.DataFrame({
            'feature1': {1: 1.0, 2: 0.33, 3: 0.33},
            'feature2': {2: 1.0, 3: 0.33, 4: 0.33}
        }, index = [1,2,3,4])
        expected_result.index.name = None
      
        result = statistic_of_critical_value(df, df_feature_columns, df_target_column, sample_id='sample_id')
        result.index.name = None
        result.columns.name = None
        result = result.round(2)
       
        
        pd.testing.assert_frame_equal(result, expected_result)

    def test_statistic_median(self):
        data = {'sample_id': ['A', 'A', 'B', 'B', 'C'],
                'feature1': [1, 2, 1, 3, 1],
                'feature2': [2, 3, 2, 4, 2],
                'target': [5, 6, 7, 8, 9]}

        df = pd.DataFrame(data)
        df_feature_columns = ['feature1', 'feature2']
        df_target_column = 'target'
        expected_result = pd.DataFrame({
            'feature1': {1: 7, 2: 6, 3: 8},
            'feature2': {2: 7, 3: 6, 4: 8}
        })

        result = statistic_of_critical_value(df, df_feature_columns, df_target_column, sample_id='sample_id', average=True)
        result.index.name = None
        result.columns.name = None
        pd.testing.assert_frame_equal(result, expected_result)


test_suite = unittest.TestLoader().loadTestsFromTestCase(TestCriticalValue)
test_runner = unittest.TextTestRunner(verbosity=3)
test_result = test_runner.run(test_suite)
test_statistic_median (__main__.TestCriticalValue) ... ok
test_value_counts (__main__.TestCriticalValue) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.021s

OK
Author: hammerdirt-analyst

conda environment: cantonal_report

statsmodels: 0.14.0
matplotlib : 3.7.1
scipy      : 1.11.2
seaborn    : 0.12.2
sklearn    : 1.3.0
pandas     : 2.0.3
numpy      : 1.25.2