Skip to content
145 changes: 145 additions & 0 deletions analysis/all_time_counts_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import os
import numpy as np
import pandas as pd
from common_variables import demographic_variables


# Cap how many rows pandas shows when the tables below are printed.
pd.set_option("display.max_rows", 50)

# Text file that write_to_file() appends the per-practice summary to.
results_path = "output/v2/practice_summ.txt"

# Column names used to stratify the long COVID counts.
stratifiers = list(demographic_variables.keys())

# File stems (under codelists/) of every codelist merged into one table.
long_covid_codelists = [
    "opensafely-nice-managing-the-long-term-effects-of-covid-19",
    "opensafely-referral-and-signposting-for-long-covid",
    "opensafely-assessment-instruments-and-outcome-measures-for-long-covid",
    "user-alex-walker-post-viral-syndrome",
]

# All codelists stacked into a single frame indexed by code.
combined_codelists = pd.concat(
    [
        pd.read_csv(f"codelists/{stem}.csv", index_col="code")
        for stem in long_covid_codelists
    ]
)

# One "snomed_<code>_date" column name per row of the combined codelist.
individual_code_dates = [
    f"snomed_{code}_date" for code in combined_codelists.index
]

# Ethnicity codelist indexed by code; used to turn codes into categories.
ethnicity_codelist = pd.read_csv("codelists/opensafely-ethnicity.csv", index_col="Code")
Copy link
Copy Markdown
Author

@rebkwok rebkwok Jul 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New - read the ethnicity codelist so we can use it for categorising



def crosstab(idx):
    """Cross-tabulate ``idx`` against the module-level ``df["long_covid"]``.

    Args:
        idx: Series of category labels (one per row of ``df``) to tabulate.

    Returns:
        DataFrame indexed by category with four columns: the two raw count
        columns, the share of each category found in column ``1`` scaled to
        a rate per 100,000, and each category's percentage of all rows in
        column ``1``. Both derived columns are rounded to one decimal place.

    NOTE(review): the fixed four column labels assume ``long_covid`` takes
    exactly two values, with ``1`` marking a coded patient - confirm.
    """
    outcome = df["long_covid"]

    def _tabulate(normalize):
        # Same table each time; only the normalisation mode differs.
        return pd.crosstab(idx, outcome, normalize=normalize, dropna=False)

    raw_counts = _tabulate(False)
    # Row-normalised: proportion of each category that has outcome 1.
    rate_per_100k = (_tabulate("index")[1] * 100000).round(1)
    # Column-normalised: each category's share of everyone with outcome 1.
    share_of_cases = (_tabulate("columns")[1] * 100).round(1)

    table = pd.concat([raw_counts, rate_per_100k, share_of_cases], axis=1)
    table.columns = ["No long COVID", "Long COVID", "Rate per 100,000", "%"]
    return table


def redact_small_numbers(df, column):
    """Return ``df`` with every small-count row blanked out.

    A row is redacted when its value in ``column`` is 1, 2, 3, 4 or 5; all
    cells of such a row become NaN. Zero and values above five are kept.

    The input frame is left untouched and a masked copy is returned, so a
    caller that prints or reuses the unredacted table afterwards is not
    surprised by a side effect. Masking with ``DataFrame.where`` also lets
    pandas upcast integer columns itself, instead of assigning NaN into
    them through ``.loc``.

    Args:
        df: Table to redact.
        column: Name of the column holding the counts to check.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    small = df[column].isin([1, 2, 3, 4, 5])
    # Expand the per-row flag to the full frame shape: ``where`` needs an
    # array condition to match the frame cell for cell.
    keep = np.repeat((~small).to_numpy()[:, np.newaxis], df.shape[1], axis=1)
    return df.where(keep)


def write_to_file(text_to_write, erase=False):
    """Append a message to the results file and echo it to stdout.

    The message is written followed by a blank line, and the same text is
    printed to the console.

    Args:
        text_to_write: Value to record; it is formatted with an f-string.
        erase: When true, an existing results file is deleted first so the
            message starts a fresh file.
    """
    start_afresh = erase and os.path.isfile(results_path)
    if start_afresh:
        os.remove(results_path)
    with open(results_path, "a") as handle:
        handle.write(f"{text_to_write}\n")
        print(text_to_write)
        handle.write("\n")
        print("\n")


# Columns parsed as dates: the fixed outcome and infection dates, followed
# by one "snomed_<code>_date" column per code in the combined codelist.
_date_columns = [
    "first_long_covid_date",
    "first_post_viral_fatigue_date",
    "sgss_positive",
    "primary_care_covid",
    "hospital_covid",
] + individual_code_dates

# Cohort extract, one row per patient, indexed by patient_id.
df = pd.read_csv(
    "output/v2/input_cohort.csv",
    index_col="patient_id",
    parse_dates=_date_columns,
)

# Replace ethnicity codes with categories and fill missing values
def _apply_ethnicity_category(ethnicity_code):
    """Return the ``Grouping_6`` category for one ethnicity code.

    Args:
        ethnicity_code: A code from the ethnicity codelist index, or a
            missing value.

    Returns:
        ``0`` when the code is missing, otherwise the ``Grouping_6`` value
        of the matching codelist row.

    Raises:
        KeyError: If the code is not present in ``ethnicity_codelist``.
    """
    if pd.isnull(ethnicity_code):
        return 0
    loc = ethnicity_codelist.index.get_loc(ethnicity_code)
    return ethnicity_codelist.iloc[loc].Grouping_6


# Apply to the ethnicity column alone. A row-wise DataFrame.apply would
# build a Series for every patient across all columns just to read one field.
df["ethnicity"] = df["ethnicity"].apply(_apply_ethnicity_category)
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lines 69-75 are new, to convert ethnicity codes into categories


# Surface missing values
df["region"] = df["region"].fillna("AaMissing")

# Find first COVID date: the earliest of the three infection dates per row
first_covid_date = df[["sgss_positive", "primary_care_covid", "hospital_covid"]].min(
    axis=1
)

## Crosstabs
# One table per stratifier, stacked under an (Attribute, Category) index.
crosstabs = [crosstab(df[v]) for v in stratifiers]
# Exactly one key per table. The previous `stratifiers + ["imd"]` supplied a
# surplus key that pandas silently dropped; newer releases deprecate that
# length mismatch.
# NOTE(review): if a separate "imd" table was meant to be appended to
# `crosstabs`, it is missing - confirm.
all_together = pd.concat(
    crosstabs, axis=0, keys=stratifiers, names=["Attribute", "Category"]
)
print(all_together)
redact_small_numbers(all_together, "Long COVID").to_csv("output/v2/counts_table.csv")

## All long-covid codes table
codes = [str(code) for code in combined_codelists.index]
# Pick the per-code columns by their exact "snomed_<code>" names. The earlier
# `str.lstrip("snomed_")` strips a *set of characters*, not a prefix, so it
# also mangled unrelated column names and needed a full copy of the cohort.
all_codes = df[[f"snomed_{code}" for code in codes]].sum()
# Re-label the totals with integer codes so they join onto the codelist index.
all_codes.index = pd.Index(codes).astype("int64")
all_codes = all_codes.rename("Total records")
all_codes = combined_codelists.join(all_codes)
# Each code's share of all recorded long COVID codes.
all_codes["%"] = (all_codes["Total records"] / all_codes["Total records"].sum()) * 100
redact_small_numbers(all_codes, "Total records").to_csv(
    "output/v2/all_long_covid_codes.csv"
)
print(all_codes.columns)

## Descriptives by practice
# Sum of the long_covid column within each practice.
by_practice = (
    df[["long_covid", "practice_id"]].groupby("practice_id").sum()["long_covid"]
)
write_to_file(f"Total patients coded: {by_practice.sum()}", erase=True)

# Combined total of the ten practices with the highest sums.
top_10_count = by_practice.sort_values().tail(10).sum()
write_to_file(f"Patients coded in the highest 10 practices: {top_10_count}")

practice_summ = by_practice.describe()
write_to_file(f"Summary stats by practice:\n{practice_summ}")

# Bin edges are right-closed: (-1, 0] holds practices with a sum of zero,
# (0, 1] exactly one, ... up to (10, 10000].
ranges = [-1, 0, 1, 2, 3, 4, 5, 10, 10000]
# observed=False is stated explicitly so empty bins stay in the output;
# newer pandas releases change the default for categorical groupers.
practice_distribution = by_practice.groupby(
    pd.cut(by_practice, ranges), observed=False
).count()
write_to_file(f"Distribution of coding within practices: {practice_distribution}")
practice_distribution.to_csv("output/v2/practice_distribution.csv")


def weekly_counts(variable):
    """Write the weekly number of records of ``variable`` to CSV.

    Rows of the module-level ``df`` are indexed by ``first_<variable>_date``
    and the non-null values of ``variable`` are counted per week, keeping
    weeks from 2020-01-01 onwards. Weekly counts of 1-5 are replaced with
    NaN before the series is printed and saved to
    ``output/v2/code_use_per_week_<variable>.csv``.

    Args:
        variable: Column name; ``first_<variable>_date`` must also exist.
    """
    per_week = df.set_index(f"first_{variable}_date")[variable]
    per_week = per_week.resample("W").count()
    per_week = per_week.loc["2020-01-01":]
    # mask() returns a new series and upcasts as needed, instead of
    # assigning NaN in place into an integer count series taken from a slice.
    per_week = per_week.mask(per_week.isin([1, 2, 3, 4, 5]))
    print(per_week)
    per_week.to_csv(f"output/v2/code_use_per_week_{variable}.csv")


weekly_counts("long_covid")
weekly_counts("post_viral_fatigue")

## COVID to long COVID interval
def interval_until(col):
    """Tabulate the days between first COVID date and the date in ``col``.

    The day difference ``df[col] - first_covid_date`` is computed per row,
    missing differences are dropped, and the rest are counted within fixed
    right-closed bins. Bin counts of 1-5 are replaced with NaN before the
    table is written to the results file and to
    ``output/v2/interval_<col>.csv``.

    Args:
        col: Name of a date column in the module-level ``df``.
    """
    days = (df[col] - first_covid_date).dt.days.dropna()
    # Negative bins are dates recorded before the first COVID date; the 0
    # bin is the same day; later bins advance in 28-day steps.
    bins = [-1000, -1, 0, 28, 56, 84, 112, 140, 168, 196, 1000]
    # observed=False is stated explicitly so empty bins stay in the output;
    # newer pandas releases change the default for categorical groupers.
    counts = days.groupby(pd.cut(days, bins), observed=False).count()
    # mask() upcasts as needed, instead of assigning NaN in place into an
    # integer count series.
    counts = counts.mask(counts.isin([1, 2, 3, 4, 5]))
    write_to_file(f"Timing of {col} relative to COVID:\n{counts}")
    counts.to_csv(f"output/v2/interval_{col}.csv")


# First long COVID date plus the first five individual code date columns.
for col in ["first_long_covid_date"] + individual_code_dates[0:5]:
    interval_until(col)
55 changes: 55 additions & 0 deletions analysis/codelists_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from cohortextractor import codelist, codelist_from_csv, combine_codelists

def _ctv3_codelist(path):
    """Load a CTV3 codelist whose codes are in the ``CTV3ID`` column."""
    return codelist_from_csv(path, system="ctv3", column="CTV3ID")


def _snomed_codelist(path):
    """Load a SNOMED codelist whose codes are in the ``code`` column."""
    return codelist_from_csv(path, system="snomed", column="code")


# COVID identification (ICD-10)
covid_codes = codelist_from_csv(
    "codelists/opensafely-covid-identification.csv",
    system="icd10",
    column="icd10_code",
)

# COVID identification in primary care (CTV3)
covid_primary_care_positive_test = _ctv3_codelist(
    "codelists/opensafely-covid-identification-in-primary-care-probable-covid-positive-test.csv"
)
covid_primary_care_code = _ctv3_codelist(
    "codelists/opensafely-covid-identification-in-primary-care-probable-covid-clinical-code.csv"
)
covid_primary_care_sequalae = _ctv3_codelist(
    "codelists/opensafely-covid-identification-in-primary-care-probable-covid-sequelae.csv"
)
any_primary_care_code = combine_codelists(
    covid_primary_care_code,
    covid_primary_care_positive_test,
    covid_primary_care_sequalae,
)

# Long COVID (SNOMED): diagnostic, referral and assessment codelists
long_covid_diagnostic_codes = _snomed_codelist(
    "codelists/opensafely-nice-managing-the-long-term-effects-of-covid-19.csv"
)
long_covid_referral_codes = _snomed_codelist(
    "codelists/opensafely-referral-and-signposting-for-long-covid.csv"
)
long_covid_assessment_codes = _snomed_codelist(
    "codelists/opensafely-assessment-instruments-and-outcome-measures-for-long-covid.csv"
)
any_long_covid_code = combine_codelists(
    long_covid_diagnostic_codes, long_covid_referral_codes, long_covid_assessment_codes
)

# Post-viral syndrome (SNOMED)
post_viral_fatigue_codes = _snomed_codelist(
    "codelists/user-alex-walker-post-viral-syndrome.csv"
)

# Ethnicity (CTV3); this file keeps its codes in a "Code" column
ethnicity_codes = codelist_from_csv(
    "codelists/opensafely-ethnicity.csv",
    system="ctv3",
    column="Code",
)
Loading