import sys
!{sys.executable} -m pip install logomaker biopython requests

import Bio.SeqIO
import Bio.SeqUtils
from Bio import motifs
from Bio.Seq import Seq
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from requests import get
from io import StringIO
import logomaker
import itertools

path_fasta = ("../data/motif_alignments.fa")
fasta_db = Bio.SeqIO.parse(path_fasta, "fasta")
fasta_db = [str(i.seq) for i in fasta_db]

fasta_db[0:5]

['AGAGAAGGAAAUAUCGAGAAAUGCCACAGGGAACUGUGAAGUGGUUCAAC',
 'GCUUCAUGAUCGUCGGUUGCUCCCCAGCGGUCACCGGCGGUGACACCACC',
 'UGUCCGGAGCAACCCACCACAUGCCAAGUCCCUCCGUCACCUCGCCGCAA',
 'UCGUGGCGAGCGUGAACAUCAAGCCACUCGAGGACAAGAUCCUCGUUCAG',
 'UCGCCAACUUCACCGUGGCGUCGACACCGCGCAUGUUCGACCGCCAGAGC']

logomaker.alignment_to_matrix(fasta_db, to_type="counts")[0:5]

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

logomaker.alignment_to_matrix(fasta_db, to_type="weight")[0:5]

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

array_bg = [0.305, 0.195, 0.195, 0.305]
logomaker.alignment_to_matrix(
    fasta_db, to_type="weight", background=np.array([0.305, 0.195, 0.195, 0.305])
)[0:5]

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

def plot_logo(data, to_type="weight", background=None,
    ignore="", title="", ylim=[-6, 2],
    colors = "classic", vpad=0, figsize = [10,3]):
    data_logo = logomaker.alignment_to_matrix(
        data, to_type=to_type, background=background,
        characters_to_ignore=ignore
    )
    data_logo.index = data_logo.index + 1
    title = "{0} (n = {1})".format(title, len(data))
    logo = logomaker.Logo(data_logo,
        fade_below=0.33, flip_below = False,
        color_scheme=colors,
        vpad=vpad, figsize = figsize)
    logo.ax.set_ylim(ylim)
    logo.ax.text(0.5, ylim[1] + 0.1, title, fontsize=14)
    logo.ax.set_xticks(np.arange(2, len(data[0]) + 1, 2))

plot_logo(fasta_db, title="all cleavage sites")

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

# define custom colors for the four bases
colors = {"A": "#E7298A", "C": "#66A61E", "G": "#E6AB02", "U": "#999999"}
plot_logo(fasta_db, title="all cleavage sites", colors=colors)

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

# use a custom background array for A, C, G, U
plot_logo(fasta_db, title="all cleavage sites",
    background=np.array([0.305, 0.195, 0.195, 0.305]),
    colors=colors
)

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

# use probability instead of weight as alternative
plot_logo(fasta_db, title="all cleavage sites", colors=colors,
    to_type="probability", ylim=[0, 1])

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

# use information instead of weight as alternative
plot_logo(fasta_db, title="all cleavage sites", colors=colors,
    to_type="information", ylim=[0, 1])

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

# leave more space between letters: vpad parameter between 0 and 1
plot_logo(fasta_db, title="all cleavage sites", colors=colors, vpad = 0.5)

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

# change size and xport figure
plot_logo(fasta_db, title="all cleavage sites", colors=colors, figsize = [7.0,3.0])
plt.savefig("../output/sequence_logo.svg")

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

# compute k-mers, for example with length = 5
kmer_length = 5
kmers = [''.join(p) for p in itertools.product(['G', 'A', 'U', 'C'], repeat=kmer_length)]
print("A total of {0} k-mers with length {1} have been computed.".format(len(kmers), kmer_length))

# make data frame with probabilities
df_prob = logomaker.alignment_to_matrix(
    fasta_db, to_type="probability", background=np.array([0.305, 0.195, 0.195, 0.305])
)

# select a window of length 5, here the interesting region with CA motif
df_prob = df_prob.iloc[23:28].reset_index()

A total of 1024 k-mers with length 5 have been computed.

/home/michael/micromamba/envs/jupyter/lib/python3.9/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

# extract nucleotide probabilities and make new df
result = {}
for kmer in kmers:
    result[kmer] = [df_prob.at[pos, kmer[pos]] for pos in range(0, len(kmer))]

result = (
    pd.DataFrame(result)
    .melt()
    .groupby(by="variable", as_index=False, sort=False)
    .agg(["sum", "mean", "median"])
    .value
    .sort_values("sum", ascending=False)
)

result["rank"] = result["sum"].rank(ascending=False)

# print top 10 hits
result.head(10)

fasta_dna = [i.replace("U", "T") for i in fasta_db]
motif = motifs.create([Seq(i[23:28]) for i in fasta_dna], alphabet="ACGT")
motif.consensus

Seq('CCACG')

motif.degenerate_consensus

Seq('NCANN')

ref_seq=Seq("".join(fasta_dna))
print(f"Length of reference sequence: {len(ref_seq)}")
ref_seq

Length of reference sequence: 1300

Seq('AGAGAAGGAAATATCGAGAAATGCCACAGGGAACTGTGAAGTGGTTCAACGCTT...GTG')

result2 = pd.DataFrame(columns=(["start", "end", "strand", "seq", "score"]))

for position, score in motif.pssm.search(ref_seq, threshold=4.0):
    result2 = pd.concat(
        [
            result2,
            pd.DataFrame({
                "start": position,
                "end": position + kmer_length,
                "strand": "+" if position > 0 else "-",
                "seq": str(ref_seq[position:position + kmer_length]),
                "score": score,

            }, index = [1]),
        ]
    ).sort_values("score", ascending=False)

result2["rank"] = result2["score"].rank(ascending=False)

# print top 10 hits
result2.head(10)

!pip freeze

anyio @ file:///home/conda/feedstock_root/build_artifacts/anyio_1688651106312/work/dist
argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1692818318753/work
argon2-cffi-bindings @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi-bindings_1649500321618/work
asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1667325728359/work
attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1683424013410/work
backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
backports.functools-lru-cache @ file:///home/conda/feedstock_root/build_artifacts/backports.functools_lru_cache_1618230623929/work
beautifulsoup4 @ file:///home/conda/feedstock_root/build_artifacts/beautifulsoup4_1680888073205/work
biopython==1.79
black==22.10.0
bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_1696630167146/work
certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1700303426725/work/certifi
cffi @ file:///tmp/abs_98z5h56wf8/croots/recipe/cffi_1659598650955/work
chardet==4.0.0
charset-normalizer==3.0.1
click==8.1.3
contourpy==1.0.6
cycler==0.11.0
debugpy @ file:///tmp/build/80754af9/debugpy_1637091799509/work
decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work
defusedxml @ file:///home/conda/feedstock_root/build_artifacts/defusedxml_1615232257335/work
entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1643888246732/work
exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1700579780973/work
executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1667317341051/work
fastjsonschema @ file:///home/conda/feedstock_root/build_artifacts/python-fastjsonschema_1700055509243/work/dist
fonttools==4.38.0
idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1701026962277/work
importlib-metadata==7.0.0
importlib-resources @ file:///home/conda/feedstock_root/build_artifacts/importlib_resources_1699364556997/work
ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1667262163608/work
ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1667140637743/work
ipython-genutils==0.2.0
isodate==0.6.1
jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1659959867326/work
Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1654302431367/work
jsonschema @ file:///home/conda/feedstock_root/build_artifacts/jsonschema-meta_1700159890288/work
jsonschema-specifications @ file:///home/conda/feedstock_root/build_artifacts/jsonschema-specifications_1701365715051/work
jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1633454794268/work
jupyter-server @ file:///croot/jupyter_server_1671707632269/work
jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1667424442212/work
jupyterlab_pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1700744013163/work
kiwisolver==1.4.4
logomaker==0.8
MarkupSafe @ file:///opt/conda/conda-bld/markupsafe_1654597864307/work
matplotlib==3.6.2
matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1660814786464/work
mistune @ file:///home/conda/feedstock_root/build_artifacts/mistune_1698947099619/work
mypy-extensions==0.4.3
nbclassic @ file:///home/conda/feedstock_root/build_artifacts/nbclassic_1683202081046/work
nbclient @ file:///home/conda/feedstock_root/build_artifacts/nbclient_1662750566673/work
nbconvert==7.12.0
nbformat @ file:///home/conda/feedstock_root/build_artifacts/nbformat_1690814868471/work
nest-asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1664684991461/work
notebook @ file:///home/conda/feedstock_root/build_artifacts/notebook_1695225629675/work
notebook_shim @ file:///home/conda/feedstock_root/build_artifacts/notebook-shim_1682360583588/work
numpy==1.23.4
packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1637239678211/work
pandas==1.5.1
pandocfilters @ file:///home/conda/feedstock_root/build_artifacts/pandocfilters_1631603243851/work
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1638334955874/work
pathspec==0.10.2
pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1667297516076/work
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
Pillow==9.3.0
pkgutil_resolve_name @ file:///home/conda/feedstock_root/build_artifacts/pkgutil-resolve-name_1694617248815/work
platformdirs==2.5.4
prometheus-client @ file:///home/conda/feedstock_root/build_artifacts/prometheus_client_1700579315247/work
prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1667565496306/work
psutil @ file:///opt/conda/conda-bld/psutil_1656431268089/work
ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
pure-eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1642875951954/work
pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1636257122734/work
Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1660666458521/work
pyopenproject==0.7.4
pyparsing @ file:///home/conda/feedstock_root/build_artifacts/pyparsing_1652235407899/work
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1626286286081/work
pytz==2022.6
PyYAML==5.3.1
pyzmq==19.0.2
referencing @ file:///home/conda/feedstock_root/build_artifacts/referencing_1701974441252/work
requests==2.25.1
rpds-py @ file:///croot/rpds-py_1698945930462/work
Send2Trash @ file:///home/conda/feedstock_root/build_artifacts/send2trash_1682601222253/work
six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
sniffio @ file:///home/conda/feedstock_root/build_artifacts/sniffio_1662051266223/work
soupsieve @ file:///home/conda/feedstock_root/build_artifacts/soupsieve_1693929250441/work
stack-data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1667334518791/work
terminado @ file:///home/conda/feedstock_root/build_artifacts/terminado_1699810101464/work
tinycss2 @ file:///home/conda/feedstock_root/build_artifacts/tinycss2_1666100256010/work
tomli==2.0.1
tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1648827245914/work
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1666115969632/work
typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1702176139754/work
urllib3==1.26.14
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1600965781394/work
webencodings @ file:///home/conda/feedstock_root/build_artifacts/webencodings_1694681268211/work
websocket-client @ file:///home/conda/feedstock_root/build_artifacts/websocket-client_1701630677416/work
zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1695255097490/work

	A	C	G	U
pos
0	-0.321928	0.415037	0.093109	-0.321928
1	-0.584963	0.900464	0.415037	-2.906891
2	0.263034	-0.099536	-0.099536	-0.099536
3	-0.321928	-0.099536	-0.321928	0.552541
4	-0.584963	0.415037	0.263034	-0.321928

	A	C	G	U
pos
0	-0.608809	0.773491	0.451563	-0.608809
1	-0.871844	1.258918	0.773491	-3.193772
2	-0.023847	0.258918	0.258918	-0.386417
3	-0.608809	0.258918	0.036526	0.265660
4	-0.871844	0.773491	0.621488	-0.608809

	sum	mean	median	rank
variable
CCACG	2.500000	0.500000	0.400000	1.0
ACACG	2.466667	0.493333	0.400000	2.0
CCACC	2.400000	0.480000	0.400000	3.0
CCAGG	2.400000	0.480000	0.366667	4.0
ACAGG	2.366667	0.473333	0.366667	6.0
ACACC	2.366667	0.473333	0.400000	6.0
CCACA	2.366667	0.473333	0.400000	6.0
ACACA	2.333333	0.466667	0.400000	8.0
UCACG	2.300000	0.460000	0.400000	9.5
CCAGC	2.300000	0.460000	0.366667	9.5

	start	end	strand	seq	score	rank
1	-1149	-1144	-	CGTGG	5.163332	3.0
1	-1087	-1082	-	CGTGG	5.163332	3.0
1	1093	1098	+	CCACG	5.163332	3.0
1	-95	-90	-	CGTGG	5.163332	3.0
1	-724	-719	-	CGTGG	5.163332	3.0
1	598	603	+	ACACG	5.011329	7.5
1	-778	-773	-	CGTGT	5.011329	7.5
1	1073	1078	+	ACACG	5.011329	7.5
1	-645	-640	-	CGTGT	5.011329	7.5
1	623	628	+	CCAGG	4.703900	12.5

Plot logos with python `logomaker`¶

Description¶

Requirements¶

Data import¶

Logo plots¶

Extract sequence probabilities¶

Variant 1¶

Variant 2¶

Session info¶

	A	C	G	U
pos
0	5.0	9.0	7.0	5.0
1	4.0	13.0	9.0	0.0
2	8.0	6.0	6.0	6.0
3	5.0	6.0	5.0	10.0
4	4.0	9.0	8.0	5.0

Plot logos with python logomaker¶

Description¶

Requirements¶

Data import¶

Logo plots¶

Extract sequence probabilities¶

Variant 1¶

Variant 2¶

Session info¶

Plot logos with python `logomaker`¶