API Reference

DHSBaseAPI dataclass

Base Class to fetch data from the DHS API.

Parameters:

    country_ids (list): List of country IDs to filter the data. Default: list()
    indicator_ids (list): List of indicator IDs to filter the data. Default: list()
    survey_ids (list): List of survey IDs to filter the data. Default: list()
    survey_year (list): List of survey years to filter the data. Default: list()
    survey_year_start (list): List of survey year start dates to filter the data. Default: list()
    survey_year_end (list): List of survey year end dates to filter the data. Default: list()
    survey_type (list): List of survey types to filter the data. Default: list()
    survey_characteristics_ids (list): List of survey characteristics IDs to filter the data. Default: list()
    tagIds (list): List of tag IDs to filter the data. Default: list()
    filter_fields (list): List of fields to filter the data. Default: list()

Source code in pdhs/base_api.py
@dataclass
class DHSBaseAPI:

    """
    Base Class to fetch data from the DHS API.

    Args:
        country_ids (list): List of country IDs to filter the data.
        indicator_ids (list): List of indicator IDs to filter the data.
        survey_ids (list): List of survey IDs to filter the data.
        survey_year (list): List of survey years to filter the data.
        survey_year_start (list): List of survey year start dates to filter the data.
        survey_year_end (list): List of survey year end dates to filter the data.
        survey_type (list): List of survey types to filter the data.
        survey_characteristics_ids (list): List of survey characteristics IDs to filter the data.
        tagIds (list): List of tag IDs to filter the data.
        filter_fields (list): List of fields to filter the data.

    """
    _url_extension: str
    country_ids: List[str] = field(default_factory=list)
    indicator_ids: List[str] = field(default_factory=list)
    survey_ids: List[str] = field(default_factory=list)
    survey_year: List[str] = field(default_factory=list)
    survey_year_start: List[str] = field(default_factory=list)
    survey_year_end: List[str] = field(default_factory=list)
    survey_type: List[str] = field(default_factory=list)
    survey_characteristics_ids: List[str] = field(default_factory=list)
    tagIds: List[str] = field(default_factory=list)
    filter_fields: List[str] = field(default_factory=list)  # Fields to filter
    _timeout: int = 10  # Request timeout in seconds


    def __post_init__(self):
        """Construct the API URL after initialization."""
        self.url = (
            f"http://api.dhsprogram.com/rest/dhs/{self._url_extension}?"
            f"surveyYears={self._convert_query_list_to_string(self.survey_year)}&"
            f"countryIds={self._convert_query_list_to_string(self.country_ids)}&"
            f"indicatorIds={self._convert_query_list_to_string(self.indicator_ids)}&"
            f"surveyYearIds={self._convert_query_list_to_string(self.survey_ids)}&"
            f"surveyYearStarts={self._convert_query_list_to_string(self.survey_year_start)}&"
            f"surveyYearEnds={self._convert_query_list_to_string(self.survey_year_end)}&"
            f"surveyTypes={self._convert_query_list_to_string(self.survey_type)}&"
            f"surveyCharacteristicsIds={self._convert_query_list_to_string(self.survey_characteristics_ids)}&"
            f"tagIds={self._convert_query_list_to_string(self.tagIds)}"
        )
        logging.info(f"API URL constructed: {self.url}")

    @staticmethod
    def _convert_query_list_to_string(query_list: List[str]) -> str:
        """Convert a list of values into a comma-separated string."""
        return ",".join(query_list) if query_list else ""

    def _fetch_data(self) -> List[dict]:
        """Fetch survey data from the API with error handling."""
        try:
            response = requests.get(self.url, timeout=self._timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx, 5xx)
            data = response.json().get("Data", [])

            if not data:
                logging.warning("API response is empty or missing 'Data' key.")
            return data
        except requests.Timeout:
            logging.error("Request timed out.")
            return []
        except requests.HTTPError as e:
            logging.error(f"HTTP error occurred: {e}")
            return []
        except requests.RequestException as e:
            logging.error(f"Request error: {e}")
            return []
        except ValueError:
            logging.error("Error decoding JSON response.")
            return []

    @staticmethod
    def _convert_data_to_polars(data: List[dict]) -> Optional[pl.DataFrame]:
        """Convert JSON data to a Polars DataFrame, handling empty data cases."""
        try:
            if not data:
                logging.warning("No data available to convert to DataFrame.")
                return None
            return pl.DataFrame(data)
        except Exception as e:
            logging.error(f"Error converting data to Polars DataFrame: {e}")
            return None

    def _select_columns(self, data: pl.DataFrame) -> Optional[pl.DataFrame]:
        """Select specific columns if filter_fields is provided."""
        if not self.filter_fields:
            return data  # Return full DataFrame if no filters are provided

        try:
            # Ensure requested columns exist
            available_columns = set(data.columns)
            missing_columns = set(self.filter_fields) - available_columns
            if missing_columns:
                logging.warning(f"Skipping missing columns: {missing_columns}")

            selected_columns = [col for col in self.filter_fields if col in available_columns]
            return data.select(selected_columns)
        except Exception as e:
            logging.error(f"Error selecting columns: {e}")
            return None

    def get_data(self) -> Optional[pl.DataFrame]:
        """
        Public method to return the final processed DataFrame.

        Returns:
            Optional[pl.DataFrame]: Polars DataFrame or None if an error occurs.
        """
        raw_data = self._fetch_data()
        df = self._convert_data_to_polars(raw_data)
        if df is None:
            return None
        return self._select_columns(df)

__post_init__()

Construct the API URL after initialization.

Source code in pdhs/base_api.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def __post_init__(self):
    """Construct the API URL after initialization."""
    self.url = (
        f"http://api.dhsprogram.com/rest/dhs/{self._url_extension}?"
        f"surveyYears={self._convert_query_list_to_string(self.survey_year)}&"
        f"countryIds={self._convert_query_list_to_string(self.country_ids)}&"
        f"indicatorIds={self._convert_query_list_to_string(self.indicator_ids)}&"
        f"surveyYearIds={self._convert_query_list_to_string(self.survey_ids)}&"
        f"surveyYearStarts={self._convert_query_list_to_string(self.survey_year_start)}&"
        f"surveyYearEnds={self._convert_query_list_to_string(self.survey_year_end)}&"
        f"surveyTypes={self._convert_query_list_to_string(self.survey_type)}&"
        f"surveyCharacteristicsIds={self._convert_query_list_to_string(self.survey_characteristics_ids)}&"
        f"tagIds={self._convert_query_list_to_string(self.tagIds)}"
    )
    logging.info(f"API URL constructed: {self.url}")

get_data()

Public method to return the final processed DataFrame.

Returns:

    Optional[pl.DataFrame]: Polars DataFrame, or None if an error occurs.

Source code in pdhs/base_api.py
def get_data(self) -> Optional[pl.DataFrame]:
    """
    Public method to return the final processed DataFrame.

    Returns:
        Optional[pl.DataFrame]: Polars DataFrame or None if an error occurs.
    """
    raw_data = self._fetch_data()
    df = self._convert_data_to_polars(raw_data)
    if df is None:
        return None
    return self._select_columns(df)
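A minimal usage sketch of the public entry point, assuming the countries endpoint; the filter_fields column names are illustrative assumptions, not guaranteed response fields:

```python
from pdhs.countries import GetCountries

# filter_fields trims the returned columns; columns not present in the response are skipped with a warning.
countries = GetCountries(
    country_ids=["AL"],
    filter_fields=["CountryName", "DHS_CountryCode"],  # assumed column names
)
df = countries.get_data()  # None if the request or JSON conversion fails
if df is not None:
    print(df)
```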

GetCountries dataclass

Bases: DHSBaseAPI

Class to fetch country data from the DHS API.

Inherits from DHSBaseAPI and uses the 'countries' endpoint.

Parameters:

Name Type Description Default
country_ids list

List of country IDs to filter the data.

list()
indicator_ids list

List of indicator IDs to filter the data.

list()
survey_ids list

List of survey IDs to filter the data.

list()
survey_year list

List of survey years to filter the data.

list()
survey_year_start list

List of survey year start dates to filter the data.

list()
survey_year_end list

List of survey year end dates to filter the data.

list()
survey_type list

List of survey types to filter the data.

list()
survey_characteristics_ids list

List of survey characteristics IDs to filter the data.

list()
tagIds list

List of tag IDs to filter the data.

list()
filter_fields list

List of fields to filter the data.

list()

Returns:

    DataFrame: A polars DataFrame containing the country data.

Example:

    from pdhs.countries import GetCountries
    countries_data = GetCountries(country_ids = ["AL"])
    df = countries_data.get_data()
    print(df)

Source code in pdhs/countries.py
@dataclass
class GetCountries(DHSBaseAPI):
    """
    Class to fetch country data from the DHS API.

    Inherits from DHSBaseAPI and uses the 'countries' endpoint.

    Args:
        country_ids (list): List of country IDs to filter the data.
        indicator_ids (list): List of indicator IDs to filter the data.
        survey_ids (list): List of survey IDs to filter the data.
        survey_year (list): List of survey years to filter the data.
        survey_year_start (list): List of survey year start dates to filter the data.
        survey_year_end (list): List of survey year end dates to filter the data.
        survey_type (list): List of survey types to filter the data.
        survey_characteristics_ids (list): List of survey characteristics IDs to filter the data.
        tagIds (list): List of tag IDs to filter the data.
        filter_fields (list): List of fields to filter the data.

    Returns:
        DataFrame: A polars DataFrame containing the country data.

    Example:
    ```python
        from pdhs.countries import GetCountries
        countries_data = GetCountries(country_ids = ["AL"])
        df = countries_data.get_data()
        print(df)
    ```
    """
    _url_extension: str = "countries"

GetDatasets dataclass

Bases: DHSBaseAPI

Class to fetch datasets from the DHS API.

Parameters:

    country_ids (list): List of country IDs to filter the data. Default: list()
    indicator_ids (list): List of indicator IDs to filter the data. Default: list()
    survey_ids (list): List of survey IDs to filter the data. Default: list()
    survey_year (list): List of survey years to filter the data. Default: list()
    survey_year_start (list): List of survey year start dates to filter the data. Default: list()
    survey_year_end (list): List of survey year end dates to filter the data. Default: list()
    survey_type (list): List of survey types to filter the data. Default: list()
    survey_characteristics_ids (list): List of survey characteristics IDs to filter the data. Default: list()
    tagIds (list): List of tag IDs to filter the data. Default: list()
    filter_fields (list): List of fields to filter the data. Default: list()
    select_surveys (str): Comma-separated list of survey IDs to select. Default: None
    file_format (str): Format of the files to retrieve (e.g., "DT" for data tables). Default: None
    file_type (str): Type of the files to retrieve (e.g., "CSV", "JSON"). Default: None

Returns:

    DataFrame: A polars DataFrame containing the dataset information.

Example:

    from pdhs.datasets import GetDatasets
    datasets_data = GetDatasets(country_ids=["NG"], file_format="DT")
    df = datasets_data.get_data()
    print(df)

Source code in pdhs/datasets.py
@dataclass
class GetDatasets(DHSBaseAPI):
    """
    Class to fetch datasets from the DHS API.

    Args:
        country_ids (list): List of country IDs to filter the data.
        indicator_ids (list): List of indicator IDs to filter the data.
        survey_ids (list): List of survey IDs to filter the data.
        survey_year (list): List of survey years to filter the data.
        survey_year_start (list): List of survey year start dates to filter the data.
        survey_year_end (list): List of survey year end dates to filter the data.
        survey_type (list): List of survey types to filter the data.
        survey_characteristics_ids (list): List of survey characteristics IDs to filter the data.
        tagIds (list): List of tag IDs to filter the data.
        filter_fields (list): List of fields to filter the data.
        select_surveys (str): Comma-separated list of survey IDs to select.
        file_format (str): Format of the files to retrieve (e.g., "DT" for data tables).
        file_type (str): Type of the files to retrieve (e.g., "CSV", "JSON").

    Returns:
        DataFrame: A polars DataFrame containing the dataset information.

    Example:
    ```python
        from pdhs.datasets import GetDatasets
        datasets_data = GetDatasets(country_ids=["NG"], file_format="DT")
        df = datasets_data.get_data()
        print(df)
    ```
    """
    _url_extension: str = "datasets"
    select_surveys: str = None
    file_format: str = None
    file_type: str = None

    def __post_init__(self):
        super().__post_init__()

        if self.select_surveys is not None:
            self.url += f"&selectSurveys={self.select_surveys}"
        if self.file_format is not None:
            self.url += f"&fileFormat={self.file_format}"
        if self.file_type is not None:
            self.url += f"&fileType={self.file_type}"

        logging.info(f"Extended API URL constructed: {self.url}")

DHSDownloader dataclass

A class to handle downloading datasets from the Demographic and Health Surveys (DHS) Program.

This class provides methods to authenticate with the DHS API, search for available datasets, and download selected datasets to a specified directory.

Requires Playwright for browser automation and requests for HTTP requests.

Parameters:

    email (str): Email address registered with the DHS Program.
    password (str): DHS account password.
    project_name (str): Name of the project to select from the DHS project dropdown.
    dataframe (pl.DataFrame): Polars DataFrame containing dataset metadata.
    download_path (str, optional): Directory where downloaded datasets will be saved. Defaults to "downloads".

Methods:

    download_all_datasets(dataset_ids: list): Downloads all datasets specified by their IDs.
    load_dataset_as_dataframe(dataset_id: str): Loads a downloaded dataset into a Polars DataFrame.

Example:

    from pdhs.datasets import GetDatasets
    from pdhs.download import DHSDownloader
    downloader = DHSDownloader(
        email="example@email.com",
        password="your_password",
        project_name="Your Project Name",
        dataframe=GetDatasets(country_ids=["NG"], file_format="DT").get_data(),
    )

Source code in pdhs/download.py
@dataclass
class DHSDownloader:
    """
    A class to handle downloading datasets from the Demographic and Health Surveys (DHS) Program.

    This class provides methods to authenticate with the DHS API, search for available datasets,
    and download selected datasets to a specified directory.

    Requires Playwright for browser automation and requests for HTTP requests.

    Args:
        email (str): Email address registered with the DHS Program.
        password (str): DHS account password.
        project_name (str): Name of the project to select from the DHS project dropdown.
        dataframe (pl.DataFrame): Polars DataFrame containing dataset metadata.
        download_path (str, optional): Directory where downloaded datasets will be saved. Defaults to "downloads".

    Methods:
        download_all_datasets(dataset_ids: list): Downloads all datasets specified by their IDs.
        load_dataset_as_dataframe(dataset_id: str): Loads a downloaded dataset into a Polars DataFrame.

    Example:
    ```python
        from pdhs.datasets import GetDatasets
        from pdhs.download import DHSDownloader
        downloader = DHSDownloader(
            email="example@email.com",
            password="your_password",
            project_name="Your Project Name",
            dataframe=GetDatasets(country_ids=["NG"], file_format="DT").get_data(),
        )
    ```
    """

    email: str
    password: str
    project_name: str
    dataframe: pl.DataFrame
    download_path: Optional[str] = None

    def __post_init__(self):
        if self.download_path is None:
            self.download_path = "downloads"

    async def download_all_datasets(self, dataset_ids: list):
        """
        Iterates over the provided dataset IDs and downloads each dataset.
        Args:
            dataset_ids (list): List of dataset IDs to download.
        """
        for dataset_id in dataset_ids:
            try:
                await self._download_single_dataset(dataset_id)
            except ValueError as e:
                print(f"Skipping dataset {dataset_id} due to error: {e}")

    async def _download_single_dataset(self, dataset_id: str):
        """
        Downloads a single dataset by filtering the dataframe and automating the download process.
        """
        # Automatically determine the FileName column
        file_name_column = "FileName"
        if file_name_column not in self.dataframe.columns:
            raise ValueError(f"Column '{file_name_column}' not found in the provided dataframe.")

        # Filter the dataframe for the current dataset_id
        filtered_df = self.dataframe.filter(pl.col(file_name_column) == dataset_id)

        # Extract values from the filtered dataframe
        if filtered_df.is_empty():
            raise ValueError(f"No data found for dataset_id: {dataset_id}")

        country_name = filtered_df['CountryName'][0]
        country_code = filtered_df['DHS_CountryCode'][0]
        survey_id = filtered_df['SurveyNum'][0]

        # Print extracted values (optional)
        print(f"Downloading dataset: {dataset_id}")
        print(f"Country Name: {country_name}")
        print(f"Country Code: {country_code}")
        print(f"Survey ID: {survey_id}")

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(accept_downloads=True)
            page = await context.new_page()

            # Navigate to the DHS login page
            await page.goto("https://dhsprogram.com/data/dataset_admin/login_main.cfm")

            # Fill in the login form
            await page.fill("input[name='UserName']", self.email)
            await page.fill("input[name='UserPass']", self.password)

            # Submit the login form
            await page.click("input[type='submit']")

            # Wait for navigation after login
            await page.wait_for_load_state("networkidle")

            # Select the project from the dropdown
            await page.select_option("select[name='proj_id']", label=self.project_name)

            # Wait for the project selection to complete
            await page.wait_for_load_state("networkidle")

            # Extract cookies from Playwright
            cookies = await context.cookies()
            session = requests.Session()
            for cookie in cookies:
                session.cookies.set(cookie['name'], cookie['value'])

            # Directly download the dataset using the provided URL
            download_url = f"https://dhsprogram.com/customcf/legacy/data/download_dataset.cfm?Filename={dataset_id}&Tp=1&Ctry_Code={country_code}&surv_id={survey_id}&dm=1&dmode=nm"
            save_path = f"{self.download_path}/{dataset_id}"
            self._download_file_with_session(session, download_url, save_path)

            # Close the browser
            await browser.close()

    @staticmethod
    def _download_file_with_session(session, url, save_path):
        """
        Downloads a file using a session with cookies and saves it to the specified path.

        Args:
            session (requests.Session): The session with cookies.
            url (str): The URL of the file to download.
            save_path (str): The local path where the file will be saved.
        """
        try:
            # Create the directory if it doesn't exist
            os.makedirs(os.path.dirname(save_path), exist_ok=True)

            response = session.get(url, stream=True)
            response.raise_for_status()

            with open(save_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            print(f"File downloaded successfully and saved to {save_path}")
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")

    def load_dataset_as_dataframe(self, dataset_id: str) -> pl.DataFrame:
        """
        Loads a downloaded dataset into a Polars DataFrame.

        Args:
            dataset_id (str): The ID of the dataset to load.

        Returns:
            pl.DataFrame: The dataset loaded as a Polars DataFrame.
        """
        file_path = f"{self.download_path}/{dataset_id}"
        try:
            # Check if the file is a ZIP file
            if file_path.lower().endswith(".zip"):
                # Extract the ZIP file
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(self.download_path)
                    print(f"Extracted {dataset_id} to {self.download_path}")

                # Find all extracted files
                extracted_files = [
                    f for f in os.listdir(self.download_path)
                    if os.path.isfile(os.path.join(self.download_path, f)) and not f.endswith(".zip")
                ]

                # Filter files by supported extensions
                supported_extensions = ["csv", "dat", "dta", "sas7bdat", "sav"]
                extracted_files = [
                    f for f in extracted_files
                    if f.split('.')[-1].lower() in supported_extensions
                ]

                if not extracted_files:
                    raise FileNotFoundError(f"No supported files found after extracting {dataset_id}")

                # Select the first supported file (or implement custom logic to choose)
                file_path = os.path.join(self.download_path, extracted_files[0])
                print(f"Selected file for loading: {file_path}")

            # Determine the file extension
            file_extension = file_path.split('.')[-1].lower()

            if file_extension == "csv" or file_extension == "dat":
                # Load CSV or DAT files using Polars
                df = pl.read_csv(file_path)
            elif file_extension == "dta":
                # Load Stata files using pyreadstat
                df, meta = pyreadstat.read_dta(file_path)
                df = pl.DataFrame(df)  # Convert pandas DataFrame to Polars DataFrame
            elif file_extension == "sas7bdat":
                # Load SAS files using pyreadstat
                df, meta = pyreadstat.read_sas7bdat(file_path)
                df = pl.DataFrame(df)  # Convert pandas DataFrame to Polars DataFrame
            elif file_extension == "sav":
                # Load SPSS files using pyreadstat
                df, meta = pyreadstat.read_sav(file_path)
                df = pl.DataFrame(df)  # Convert pandas DataFrame to Polars DataFrame
            else:
                raise ValueError(f"Unsupported file format: {file_extension}")

            print(f"Dataset {dataset_id} loaded successfully.")
            return df
        except FileNotFoundError:
            print(f"Dataset {dataset_id} not found in {self.download_path}.")
        except Exception as e:
            print(f"An error occurred while loading the dataset: {e}")

download_all_datasets(dataset_ids) async

Iterates over the provided dataset IDs and downloads each dataset.

Parameters:

    dataset_ids (list): List of dataset IDs to download.

Source code in pdhs/download.py
async def download_all_datasets(self, dataset_ids: list):
    """
    Iterates over the provided dataset IDs and downloads each dataset.
    Args:
        dataset_ids (list): List of dataset IDs to download.
    """
    for dataset_id in dataset_ids:
        try:
            await self._download_single_dataset(dataset_id)
        except ValueError as e:
            print(f"Skipping dataset {dataset_id} due to error: {e}")

load_dataset_as_dataframe(dataset_id)

Loads a downloaded dataset into a Polars DataFrame.

Parameters:

    dataset_id (str): The ID of the dataset to load. Required.

Returns:

    pl.DataFrame: The dataset loaded as a Polars DataFrame.

Source code in pdhs/download.py
def load_dataset_as_dataframe(self, dataset_id: str) -> pl.DataFrame:
    """
    Loads a downloaded dataset into a Polars DataFrame.

    Args:
        dataset_id (str): The ID of the dataset to load.

    Returns:
        pl.DataFrame: The dataset loaded as a Polars DataFrame.
    """
    file_path = f"{self.download_path}/{dataset_id}"
    try:
        # Check if the file is a ZIP file
        if file_path.lower().endswith(".zip"):
            # Extract the ZIP file
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(self.download_path)
                print(f"Extracted {dataset_id} to {self.download_path}")

            # Find all extracted files
            extracted_files = [
                f for f in os.listdir(self.download_path)
                if os.path.isfile(os.path.join(self.download_path, f)) and not f.endswith(".zip")
            ]

            # Filter files by supported extensions
            supported_extensions = ["csv", "dat", "dta", "sas7bdat", "sav"]
            extracted_files = [
                f for f in extracted_files
                if f.split('.')[-1].lower() in supported_extensions
            ]

            if not extracted_files:
                raise FileNotFoundError(f"No supported files found after extracting {dataset_id}")

            # Select the first supported file (or implement custom logic to choose)
            file_path = os.path.join(self.download_path, extracted_files[0])
            print(f"Selected file for loading: {file_path}")

        # Determine the file extension
        file_extension = file_path.split('.')[-1].lower()

        if file_extension == "csv" or file_extension == "dat":
            # Load CSV or DAT files using Polars
            df = pl.read_csv(file_path)
        elif file_extension == "dta":
            # Load Stata files using pyreadstat
            df, meta = pyreadstat.read_dta(file_path)
            df = pl.DataFrame(df)  # Convert pandas DataFrame to Polars DataFrame
        elif file_extension == "sas7bdat":
            # Load SAS files using pyreadstat
            df, meta = pyreadstat.read_sas7bdat(file_path)
            df = pl.DataFrame(df)  # Convert pandas DataFrame to Polars DataFrame
        elif file_extension == "sav":
            # Load SPSS files using pyreadstat
            df, meta = pyreadstat.read_sav(file_path)
            df = pl.DataFrame(df)  # Convert pandas DataFrame to Polars DataFrame
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

        print(f"Dataset {dataset_id} loaded successfully.")
        return df
    except FileNotFoundError:
        print(f"Dataset {dataset_id} not found in {self.download_path}.")
    except Exception as e:
        print(f"An error occurred while loading the dataset: {e}")

GetGeometry dataclass

Bases: DHSBaseAPI

Class to fetch geometry data from the DHS API.

Parameters:

    country_ids (list): List of country IDs to filter the data. Default: list()
    indicator_ids (list): List of indicator IDs to filter the data. Default: list()
    survey_ids (list): List of survey IDs to filter the data. Default: list()
    survey_year (list): List of survey years to filter the data. Default: list()
    survey_year_start (list): List of survey year start dates to filter the data. Default: list()
    survey_year_end (list): List of survey year end dates to filter the data. Default: list()
    survey_type (list): List of survey types to filter the data. Default: list()
    survey_characteristics_ids (list): List of survey characteristics IDs to filter the data. Default: list()
    tagIds (list): List of tag IDs to filter the data. Default: list()
    filter_fields (list): List of fields to filter the data. Default: list()

Returns:

    DataFrame: A polars DataFrame containing the geometry data.

Example:

    from pdhs.geometry import GetGeometry
    geometry_data = GetGeometry(country_ids=["AL"])
    df = geometry_data.get_data()
    print(df)

Source code in pdhs/geometry.py
@dataclass
class GetGeometry(DHSBaseAPI):
    """
    Class to fetch geometry data from the DHS API.

    Args:
        country_ids (list): List of country IDs to filter the data.
        indicator_ids (list): List of indicator IDs to filter the data.
        survey_ids (list): List of survey IDs to filter the data.
        survey_year (list): List of survey years to filter the data.
        survey_year_start (list): List of survey year start dates to filter the data.
        survey_year_end (list): List of survey year end dates to filter the data.
        survey_type (list): List of survey types to filter the data.
        survey_characteristics_ids (list): List of survey characteristics IDs to filter the data.
        tagIds (list): List of tag IDs to filter the data.
        filter_fields (list): List of fields to filter the data.

    Returns:
        DataFrame: A polars DataFrame containing the geometry data.

    Example:
    ```python
        from pdhs.geometry import GetGeometry
        geometry_data = GetGeometry(country_ids = ["AL"])
        df = geometry_data.get_data()
        print(df)
    ```
    """
    _url_extension: str = "geometry"

GetIndicatorsData dataclass

Bases: DHSBaseAPI

Class to fetch indicators data from the DHS API.

Parameters:

    country_ids (list): List of country IDs to filter the data. Default: list()
    indicator_ids (list): List of indicator IDs to filter the data. Default: list()
    survey_ids (list): List of survey IDs to filter the data. Default: list()
    survey_year (list): List of survey years to filter the data. Default: list()
    survey_year_start (list): List of survey year start dates to filter the data. Default: list()
    survey_year_end (list): List of survey year end dates to filter the data. Default: list()
    survey_type (list): List of survey types to filter the data. Default: list()
    survey_characteristics_ids (list): List of survey characteristics IDs to filter the data. Default: list()
    tagIds (list): List of tag IDs to filter the data. Default: list()
    characteristic_category (list): List of characteristic categories to filter the data. Default: list()
    characteristic_label (list): List of characteristic labels to filter the data. Default: list()
    breakdown (str): Breakdown type for the indicators. Default: ''

Returns:

    DataFrame: A polars DataFrame containing the indicators data.

Example:

    from pdhs.indicators import GetIndicatorsData
    indicators_data = GetIndicatorsData(
        country_ids=["AL"],
        characteristic_category=["wealth quintile", "region"],
        characteristic_label=["middle", "second"],
        breakdown="all"
    )
    df = indicators_data.get_data()
    print(df)

Source code in pdhs/indicators.py
@dataclass
class GetIndicatorsData(DHSBaseAPI):
    """
    Class to fetch indicators data from the DHS API.

    Args:
        country_ids (list): List of country IDs to filter the data.
        indicator_ids (list): List of indicator IDs to filter the data.
        survey_ids (list): List of survey IDs to filter the data.
        survey_year (list): List of survey years to filter the data.
        survey_year_start (list): List of survey year start dates to filter the data.
        survey_year_end (list): List of survey year end dates to filter the data.
        survey_type (list): List of survey types to filter the data.
        survey_characteristics_ids (list): List of survey characteristics IDs to filter the data.
        tagIds (list): List of tag IDs to filter the data.
        characteristic_category (list): List of characteristic categories to filter the data.
        characteristic_label (list): List of characteristic labels to filter the data.
        breakdown (str): Breakdown type for the indicators.

    Returns:
        DataFrame: A polars DataFrame containing the indicators data.

    Example:
    ```python
        from pdhs.indicators import GetIndicatorsData
        indicators_data = GetIndicatorsData(
            country_ids=["AL"],
            characteristic_category=["wealth quintile", "region"],
            characteristic_label=["middle", "second"],
            breakdown="all"
        )
        df = indicators_data.get_data()
        print(df)
    ```
    """
    _url_extension: str = "data"
    characteristic_category: List[str] = field(default_factory=list)
    characteristic_label: List[str] = field(default_factory=list)
    breakdown: str = ""

    @staticmethod
    def _convert_special_characters(input_string: str) -> str:
        """
        Converts spaces in a string to '%20' and '+' to '%2B'.

        Args:
            input_string (str): The input string to be converted.

        Returns:
            str: The converted string with spaces replaced by '%20' and '+' replaced by '%2B'.
        """
        return input_string.replace(" ", "%20").replace("+", "%2B")

    def __post_init__(self):
        super().__post_init__()
        # Apply _convert_special_characters to each string in characteristic_category
        self.characteristic_category = [
            self._convert_special_characters(cat) for cat in self.characteristic_category
        ]
        self.characteristic_label = [
            self._convert_special_characters(label) for label in self.characteristic_label
        ]
        self.url += (f"&characteristicCatrgory={self._convert_query_list_to_string(self.characteristic_category)}"
                    f"&characteristicLabel={self._convert_query_list_to_string(self.characteristic_label)}"
                    f"&breakdown={self.breakdown}")
        logging.info(f"Extended API URL constructed: {self.url}")

GetInfo dataclass

Bases: DHSBaseAPI

Class to fetch information from the DHS API.

Parameters:

    info_type (str): Type of information to retrieve (e.g., "version", "citation"). Default: None

Returns:

    DataFrame: A polars DataFrame containing the requested information.

Example:

    from pdhs.info import GetInfo
    get_info = GetInfo(info_type="citation")
    df = get_info.get_data()
    print(df)

Source code in pdhs/info.py
@dataclass
class GetInfo(DHSBaseAPI):
    """
    Class to fetch information from the DHS API.

    Args:
        info_type (str): Type of information to retrieve (e.g., "version", "citation").

    Returns:
        DataFrame: A polars DataFrame containing the requested information.

    Example:
    ```python
        from pdhs.info import GetInfo
        get_info = GetInfo(info_type="citation")
        df = get_info.get_data()
        print(df)
    ```
    """
    _url_extension: str = "info"
    info_type: str = "version"

    def __init__(self, info_type: str = None):
        # Explicitly initialize only the attributes you want to expose
        self.info_type = info_type
        # Pass the required _url_extension to the base class
        super().__init__(_url_extension=self._url_extension)

    def __post_init__(self):
        super().__post_init__()
        self.url = f"http://api.dhsprogram.com/rest/dhs/info?infoType={self.info_type}"

GetPublications dataclass

Bases: DHSBaseAPI

Class to fetch publications from the DHS API.

Parameters:

    country_ids (list): List of country IDs to filter the data. Default: list()
    indicator_ids (list): List of indicator IDs to filter the data. Default: list()
    survey_ids (list): List of survey IDs to filter the data. Default: list()
    survey_year (list): List of survey years to filter the data. Default: list()
    survey_year_start (list): List of survey year start dates to filter the data. Default: list()
    survey_year_end (list): List of survey year end dates to filter the data. Default: list()
    survey_type (list): List of survey types to filter the data. Default: list()
    survey_characteristics_ids (list): List of survey characteristics IDs to filter the data. Default: list()
    tagIds (list): List of tag IDs to filter the data. Default: list()
    filter_fields (list): List of fields to filter the data. Default: list()

Returns:

    DataFrame: A polars DataFrame containing the publications data.

Example:

    from pdhs.publications import GetPublications
    get_publications = GetPublications(country_ids=["AL"])
    df = get_publications.get_data()
    print(df)

Source code in pdhs/publications.py
@dataclass
class GetPublications(DHSBaseAPI):
    """
    Class to fetch publications from the DHS API.

    Args:
        country_ids (list): List of country IDs to filter the data.
        indicator_ids (list): List of indicator IDs to filter the data.
        survey_ids (list): List of survey IDs to filter the data.
        survey_year (list): List of survey years to filter the data.
        survey_year_start (list): List of survey year start dates to filter the data.
        survey_year_end (list): List of survey year end dates to filter the data.
        survey_type (list): List of survey types to filter the data.
        survey_characteristics_ids (list): List of survey characteristics IDs to filter the data.
        tagIds (list): List of tag IDs to filter the data.
        filter_fields (list): List of fields to filter the data.

    Returns:
        DataFrame: A polars DataFrame containing the publications data.

    Example:
    ```python
        from pdhs.publications import GetPublications
        get_publications = GetPublications(country_ids=["AL"])
        df = get_publications.get_data()
        print(df)
    ```
    """
    _url_extension: str = "publications"

GetSurveys dataclass

Bases: DHSBaseAPI

Class to fetch survey data from the DHS API.

Parameters:

    country_ids (list): List of country IDs to filter the data. Default: list()
    survey_status (str): Status of the surveys to filter (e.g., "completed", "ongoing"). Default: None
    indicator_ids (list): List of indicator IDs to filter the data. Default: list()
    survey_ids (list): List of survey IDs to filter the data. Default: list()
    survey_year (list): List of survey years to filter the data. Default: list()
    survey_year_start (list): List of survey year start dates to filter the data. Default: list()
    survey_year_end (list): List of survey year end dates to filter the data. Default: list()
    survey_type (list): List of survey types to filter the data. Default: list()
    survey_characteristics_ids (list): List of survey characteristics IDs to filter the data. Default: list()
    tagIds (list): List of tag IDs to filter the data. Default: list()

Returns:

    DataFrame: A polars DataFrame containing the survey data.

Example:

    from pdhs.surveys import GetSurveys
    survey_data = GetSurveys(
        country_ids=["NG"],
        survey_status="completed",
    )
    df = survey_data.get_data()
    print(df)   

Source code in pdhs/surveys.py
@dataclass
class GetSurveys(DHSBaseAPI):
    """
    Class to fetch survey data from the DHS API.

    Args:
        country_ids (list): List of country IDs to filter the data.
        survey_status (str): Status of the surveys to filter (e.g., "completed", "ongoing").
        indicator_ids (list): List of indicator IDs to filter the data.
        survey_ids (list): List of survey IDs to filter the data.
        survey_year (list): List of survey years to filter the data.
        survey_year_start (list): List of survey year start dates to filter the data.
        survey_year_end (list): List of survey year end dates to filter the data.
        survey_type (list): List of survey types to filter the data.
        survey_characteristics_ids (list): List of survey characteristics IDs to filter the data.
        tagIds (list): List of tag IDs to filter the data.

    Returns:
        DataFrame: A polars DataFrame containing the survey data.

    Example:
    ```python
        from pdhs.surveys import GetSurveys
        survey_data = GetSurveys(
            country_ids=["NG"],
            survey_status="completed",
        )
        df = survey_data.get_data()
        print(df)   
    ```
    """
    _url_extension: str = "surveys"
    survey_status: str = None


    def __post_init__(self):
        super().__post_init__()
        if self.survey_status is not None:
            self.url += (f"&surveyStatus={self.survey_status}")
        logging.info(f"Extended API URL constructed: {self.url}")

GetTags dataclass

Bases: DHSBaseAPI

Class to fetch tags from the DHS API.

Parameters:

    indicator_ids (list): List of indicator IDs to filter the data. Default: list()
    survey_ids (list): List of survey IDs to filter the data. Default: list()
    survey_year (list): List of survey years to filter the data. Default: list()
    survey_year_start (list): List of survey year start dates to filter the data. Default: list()
    survey_year_end (list): List of survey year end dates to filter the data. Default: list()
    survey_type (list): List of survey types to filter the data. Default: list()
    survey_characteristics_ids (list): List of survey characteristics IDs to filter the data. Default: list()
    tagIds (list): List of tag IDs to filter the data. Default: list()

Returns:

    DataFrame: A polars DataFrame containing the tags data.

Example:

    from pdhs.tags import GetTags
    tags_data = GetTags(indicator_ids=["FE_FRTR_W_TFR"])
    df = tags_data.get_data()
    print(df)

Source code in pdhs/tags.py
@dataclass
class GetTags(DHSBaseAPI):
    """
    Class to fetch tags from the DHS API.

    Args:
        indicator_ids (list): List of indicator IDs to filter the data.
        survey_ids (list): List of survey IDs to filter the data.
        survey_year (list): List of survey years to filter the data.
        survey_year_start (list): List of survey year start dates to filter the data.
        survey_year_end (list): List of survey year end dates to filter the data.
        survey_type (list): List of survey types to filter the data.
        survey_characteristics_ids (list): List of survey characteristics IDs to filter the data.
        tagIds (list): List of tag IDs to filter the data.

    Returns:
        DataFrame: A polars DataFrame containing the tags data.

    Example:
    ```python
        from pdhs.tags import GetTags
        tags_data = GetTags(indicator_ids=["FE_FRTR_W_TFR"])
        df = tags_data.get_data()
        print(df)
    ```
    """
    _url_extension: str = "tags"

GetDataUpdates dataclass

Bases: DHSBaseAPI

Class to fetch data updates from the DHS API.

Parameters:

    last_update (str): The date of the last update in YYYYMMDD format. Default: None

Returns:

    DataFrame: A polars DataFrame containing the data updates.

Example:

    from pdhs.updates import GetDataUpdates
    data_update = GetDataUpdates(last_update="20150901")
    df = data_update.get_data()
    print(df)

Source code in pdhs/updates.py
@dataclass
class GetDataUpdates(DHSBaseAPI):
    """
    Class to fetch data updates from the DHS API.

    Args:
        last_update (str): The date of the last update in YYYYMMDD format.
        This parameter is optional and can be used to filter updates since a specific date.

    Returns:
        DataFrame: A polars DataFrame containing the data updates.  

    Example:
    ```python
        from pdhs.updates import GetDataUpdates
        data_update = GetDataUpdates(last_update="20150901")
        df = data_update.get_data()
        print(df)
    ```
    """
    _url_extension: str = "dataupdates"
    last_update: str = None

    def __init__(self, last_update: str = None):
        # Explicitly initialize only the attributes you want to expose
        self.last_update = last_update
        # Pass the required _url_extension to the base class
        super().__init__(_url_extension=self._url_extension)

    def __post_init__(self):
        super().__post_init__()
        if self.last_update is not None:
            self.url += f"&lastUpdates={self.last_update}"

GetUIUpdates dataclass

Bases: DHSBaseAPI

Class to fetch UI updates from the DHS API.

Parameters:

    last_update (str): The date of the last update in YYYYMMDD format. Default: None

Returns:

    DataFrame: A polars DataFrame containing the UI updates.

Example:

    ui_update = GetUIUpdates(last_update="20150901")
    df2 = ui_update.get_data()
    print(df2)

Source code in pdhs/updates.py
@dataclass
class GetUIUpdates(DHSBaseAPI):
    """
    Class to fetch UI updates from the DHS API.

    Args:
        last_update (str): The date of the last update in YYYYMMDD format.
        This parameter is optional and can be used to filter updates since a specific date.

    Returns:
        DataFrame: A polars DataFrame containing the UI updates.

    Example:
        ui_update = GetUIUpdates(last_update="20150901")
        df2 = ui_update.get_data()
        print(df2)
    """
    _url_extension: str = "uiupdates"
    last_update: str = None

    def __init__(self, last_update: str = None):
        # Explicitly initialize only the attributes you want to expose
        self.last_update = last_update
        # Pass the required _url_extension to the base class
        super().__init__(_url_extension=self._url_extension)

    def __post_init__(self):
        super().__post_init__()
        if self.last_update is not None:
            self.url += f"&lastUpdates={self.last_update}"