diff --git a/apps/common/src/python/mediawords/util/mail.py b/apps/common/src/python/mediawords/util/mail.py
index 3e75702f6d..939676bbf2 100644
--- a/apps/common/src/python/mediawords/util/mail.py
+++ b/apps/common/src/python/mediawords/util/mail.py
@@ -13,6 +13,8 @@
 # Environment variable that, when set, will prevent the package from actually sending the email
 __ENV_MAIL_DO_NO_SEND = 'MEDIACLOUD_MAIL_DO_NOT_SEND'
 
+# queue a list of test messages sent for validation
+_sent_test_messages = []
 
 class McSendEmailException(Exception):
     """send_email() exception."""
@@ -27,6 +29,10 @@ def disable_test_mode():
     del os.environ[__ENV_MAIL_DO_NO_SEND]
 
 
+def sent_test_messages():
+    return _sent_test_messages
+
+
 def test_mode_is_enabled() -> bool:
     return __ENV_MAIL_DO_NO_SEND in os.environ
 
@@ -123,6 +129,7 @@ def send_email(message: Message) -> bool:
         mime_message.attach(message_part)
 
     if test_mode_is_enabled():
+        _sent_test_messages.append(message)
         log.info("Test mode is enabled, not actually sending any email.")
         log.debug("Omitted email:\n\n%s" % mime_message.as_string())
 
diff --git a/apps/common/src/python/mediawords/util/url/__init__.py b/apps/common/src/python/mediawords/util/url/__init__.py
index 531bc26508..1f82207c5c 100644
--- a/apps/common/src/python/mediawords/util/url/__init__.py
+++ b/apps/common/src/python/mediawords/util/url/__init__.py
@@ -178,7 +178,7 @@ def normalize_url(url: str) -> str:
     url = fix_common_url_mistakes(url)
 
     try:
-        url = canonical_url(url)
+        url = canonical_url(url)
     except Exception as ex:
         raise McNormalizeURLException("Unable to get canonical URL: %s" % str(ex))
 
diff --git a/apps/common/tests/python/mediawords/util/test_mail.py b/apps/common/tests/python/mediawords/util/test_mail.py
index 11415e26c3..cbc2f79843 100644
--- a/apps/common/tests/python/mediawords/util/test_mail.py
+++ b/apps/common/tests/python/mediawords/util/test_mail.py
@@ -4,6 +4,7 @@
     Message,
     send_email,
     send_text_email,
+    sent_test_messages,
     enable_test_mode as enable_mail_test_mode,
     disable_test_mode as disable_mail_test_mode,
 )
@@ -29,6 +30,10 @@ def test_send_mail(self):
         )
         assert send_email(message)
 
+        sent_message = sent_test_messages().pop()
+
+        assert sent_message == message
+
     def test_send_text_email(self):
         assert send_text_email(
             to='nowhere@mediacloud.org',
diff --git a/apps/tools/bin/dev/jumpstart_perl_to_python.pl b/apps/tools/bin/dev/jumpstart_perl_to_python.pl
index c59879d842..4dcce4317e 100755
--- a/apps/tools/bin/dev/jumpstart_perl_to_python.pl
+++ b/apps/tools/bin/dev/jumpstart_perl_to_python.pl
@@ -119,6 +119,12 @@ sub main
     # eq -> ==
     $code =~ s/ eq / == /g;
 
+    # undef to None
+    $code =~ s/undef/None/g;
+
+    # add parens to common db methods
+    $code =~ s/(hash(es)?|flat)$/$1()/;
+
     print $code;
 }
 
diff --git a/apps/topics-base/src/python/topics_base/alert.py b/apps/topics-base/src/python/topics_base/alert.py
new file mode 100644
index 0000000000..6472429676
--- /dev/null
+++ b/apps/topics-base/src/python/topics_base/alert.py
@@ -0,0 +1,33 @@
+from mediawords.util.log import create_logger
+log = create_logger(__name__)
+
+import mediawords.util.mail
+import topics_base.config
+import topics_base.messages
+
+def send_topic_alert(db, topic, message):
+    """ send an alert about significant activity on the topic to all users with at least write access to the topic"""
+
+    emails = db.query(
+        """
+        select distinct au.email
+            from auth_users au
+            join topic_permissions tp using (auth_users_id)
+            where
+                tp.permission in ('admin', 'write') and
+                tp.topics_id = %(a)s
+        """,
+        {'a': topic['topics_id']}).flat()
+
+    emails.extend(topics_base.config.TopicsBaseConfig.topic_alert_emails())
+
+    emails = set(emails)
+
+    for email in emails:
+        alert_message = topics_base.messages.TopicSpiderUpdateMessage(
+            to=email,
+            topic_name=topic['name'],
+            topic_url=f"https://topics.mediacloud.org/#/topics/{topic['topics_id']}/summary",
+            topic_spider_status=message,
+        )
+        mediawords.util.mail.send_email(alert_message)
diff --git a/apps/topics-base/tests/python/test_alert.py b/apps/topics-base/tests/python/test_alert.py
new file mode 100644
index 0000000000..e075dd7d2b
--- /dev/null
+++ b/apps/topics-base/tests/python/test_alert.py
@@ -0,0 +1,53 @@
+import hashlib
+
+from mediawords.db import connect_to_db
+import mediawords.test.db.create
+import mediawords.util.mail
+import topics_base.alert
+from topics_base.config import TopicsBaseConfig
+
+from mediawords.util.log import create_logger
+
+log = create_logger(__name__)
+
+def _create_permission(db, topic, permission):
+    au = {
+        'email': f'{permission}@bar.com',
+        'password_hash': 'x' * 137,
+        'full_name': 'foo bar'}
+    au = db.create('auth_users', au)
+
+    tp = {
+        'topics_id': topic['topics_id'],
+        'auth_users_id': au['auth_users_id'],
+        'permission': permission}
+    tp = db.create('topic_permissions', tp)
+
+    return au
+
+
+def test_topic_alert():
+    db = connect_to_db()
+
+    topic = mediawords.test.db.create.create_test_topic(db, 'test')
+
+    au_admin = _create_permission(db, topic, 'admin')
+    au_read = _create_permission(db, topic, 'read')
+    au_write = _create_permission(db, topic, 'write')
+
+    mediawords.util.mail.enable_test_mode()
+
+    test_message = 'foobarbat'
+
+    topics_base.alert.send_topic_alert(db, topic, test_message)
+
+    sent_mails = mediawords.util.mail.sent_test_messages()
+
+    expected_emails = [au['email'] for au in (au_admin, au_write)] + TopicsBaseConfig.topic_alert_emails()
+    got_emails = [m.to[0] for m in sent_mails]
+
+    assert len(sent_mails) == len(expected_emails)
+
+    assert set(got_emails) == set(expected_emails)
+
+
diff --git a/apps/topics-extract-story-links/src/python/topics_extract_story_links/extract_story_links.py b/apps/topics-extract-story-links/src/python/topics_extract_story_links/extract_story_links.py
index 949643f11f..23d154ba83 100644
--- a/apps/topics-extract-story-links/src/python/topics_extract_story_links/extract_story_links.py
+++ b/apps/topics-extract-story-links/src/python/topics_extract_story_links/extract_story_links.py
@@ -73,6 +73,9 @@ def _get_youtube_embed_links(db: DatabaseHandler, story: dict) -> List[str]:
         "select * from downloads where stories_id = %(a)s order by stories_id limit 1",
         {'a': story['stories_id']}).hash()
 
+    if not download:
+        return []
+
     html = fetch_content(db, download)
 
     soup = BeautifulSoup(html, 'lxml')
diff --git a/apps/topics-mine-public/Dockerfile b/apps/topics-mine-public/Dockerfile
index 0735388f3e..d403c3d309 100644
--- a/apps/topics-mine-public/Dockerfile
+++ b/apps/topics-mine-public/Dockerfile
@@ -9,4 +9,4 @@ COPY bin /opt/mediacloud/bin
 
 USER mediacloud
 
-CMD ["topics_mine_public_worker.pl"]
+CMD ["topics_mine_public_worker.py"]
diff --git a/apps/topics-mine-public/bin/topics_mine_public_worker.pl b/apps/topics-mine-public/bin/topics_mine_public_worker.pl
deleted file mode 100755
index 724e8cd641..0000000000
--- a/apps/topics-mine-public/bin/topics_mine_public_worker.pl
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env perl
-#
-# This job is a copy of MineTopic but is used to run a separate job queue for topics requested by public users.
-# - -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use MediaWords::TM::Worker; - - -sub main() -{ - MediaWords::TM::Worker::start_topics_mine_worker( 'MediaWords::Job::TM::MineTopicPublic' ); -} - -main(); diff --git a/apps/topics-mine-public/bin/topics_mine_public_worker.py b/apps/topics-mine-public/bin/topics_mine_public_worker.py new file mode 100755 index 0000000000..ac8b58b8c0 --- /dev/null +++ b/apps/topics-mine-public/bin/topics_mine_public_worker.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 + +from mediawords.job import JobBroker +from mediawords.util.log import create_logger +from topics_mine.mine import run_worker_job + +log = create_logger(__name__) + +QUEUE_NAME = 'MediaWords::Job::TM::MineTopicPublic' + +if __name__ == '__main__': + app = JobBroker(queue_name=QUEUE_NAME) + app.start_worker(handler=run_worker_job) diff --git a/apps/topics-mine/Dockerfile b/apps/topics-mine/Dockerfile index 16296056cc..4df95fedaf 100644 --- a/apps/topics-mine/Dockerfile +++ b/apps/topics-mine/Dockerfile @@ -4,23 +4,22 @@ FROM gcr.io/mcback/topics-base:latest -# Install Perl dependencies -COPY src/cpanfile /var/tmp/ +# Install Python dependencies +COPY src/requirements.txt /var/tmp/ RUN \ cd /var/tmp/ && \ - cpm install --global --resolver 02packages --no-prebuilt --mirror "$MC_PERL_CPAN_MIRROR" && \ - rm cpanfile && \ - rm -rf /root/.perl-cpm/ && \ + pip3 install -r requirements.txt && \ + rm requirements.txt && \ + rm -rf /root/.cache/ && \ true # Copy sources COPY src/ /opt/mediacloud/src/topics-mine/ -ENV PERL5LIB="/opt/mediacloud/src/topics-mine/perl:${PERL5LIB}" \ - PYTHONPATH="/opt/mediacloud/src/topics-mine/python:${PYTHONPATH}" +ENV PYTHONPATH="/opt/mediacloud/src/topics-mine/python:${PYTHONPATH}" # Copy worker script COPY bin /opt/mediacloud/bin USER mediacloud -CMD ["topics_mine_worker.pl"] +CMD ["topics_mine_worker.py"] diff --git a/apps/topics-mine/bin/mine_topic.pl b/apps/topics-mine/bin/mine_topic.pl deleted file mode 100755 index 14c275fe35..0000000000 --- a/apps/topics-mine/bin/mine_topic.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Getopt::Long; - -use MediaWords::DB; -use MediaWords::TM::CLI; -use MediaWords::TM::Mine; - -sub main -{ - my ( $topic_opt, $import_only, $skip_post_processing, $snapshots_id, $resume_snapshot ); - - binmode( STDOUT, 'utf8' ); - binmode( STDERR, 'utf8' ); - - $| = 1; - - Getopt::Long::GetOptions( - "topic=s" => \$topic_opt, - "import_only!" => \$import_only, - "resume_snapshot!" => \$resume_snapshot, - "skip_post_processing!" => \$skip_post_processing, - "snapshots_id=i" => \$snapshots_id - ) || return; - - my $args_list = [ qw(import_only skip_post_processing snapshots_id resume_snapshot) ]; - my $optional_args = join( ' ', map { "[ --$_ ]" } @{ $args_list } ); - die( "usage: $0 --topic < id > $optional_args" ) unless ( $topic_opt ); - - my $db = MediaWords::DB::connect_to_db(); - my $topics = MediaWords::TM::CLI::require_topics_by_opt( $db, $topic_opt ); - unless ( $topics ) - { - die "Unable to find topics for option '$topic_opt'"; - } - - for my $topic ( @{ $topics } ) - { - my $topics_id = $topic->{ topics_id }; - INFO "Processing topic $topics_id..."; - - if ( $resume_snapshot ) - { - ( $snapshots_id ) = $db->query( <flat(); -select * from snapshots where topics_id = ? 
-    order by snapshots_id desc limit 1
-SQL
-            die( "no snapshot found for topic $topic->{ topics_id }" ) unless ( $snapshots_id );
-        }
-
-        my $args = {
-            topics_id => $topics_id,
-            import_only => $import_only,
-            skip_post_processing => $skip_post_processing,
-            snapshots_id => $snapshots_id,
-        };
-
-        MediaWords::TM::Mine::mine_topic( $db, $topic, $args );
-
-        INFO "Done processing topic $topics_id.";
-    }
-}
-
-main();
diff --git a/apps/topics-mine/bin/mine_topic.py b/apps/topics-mine/bin/mine_topic.py
new file mode 100755
index 0000000000..de0084c1ed
--- /dev/null
+++ b/apps/topics-mine/bin/mine_topic.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+import argparse
+
+from mediawords.db import connect_to_db
+from topics_mine.mine import mine_topic
+
+def main():
+    """run mine_topic with cli args."""
+    parser = argparse.ArgumentParser(description="Run topics_mine job.")
+    parser.add_argument("-t", "--topics_id", type=int, required=True)
+    parser.add_argument("-s", "--snapshots_id", type=int, required=False)
+    parser.add_argument("-r", "--resume_snapshot", action='store_true')
+    parser.add_argument("-i", "--import_only", action='store_true')
+    parser.add_argument("-p", "--skip_post_processing", action='store_true')
+    args = parser.parse_args()
+
+    db = connect_to_db()
+
+    snapshots_id = args.snapshots_id
+    if args.resume_snapshot:
+        snapshots_id = db.query(
+            "select snapshots_id from snapshots where topics_id = %(a)s order by snapshots_id desc limit 1",
+            {'a': args.topics_id}).flat()[0]
+
+    topic = db.require_by_id('topics', args.topics_id)
+
+    mine_topic(
+        db=db,
+        topic=topic,
+        snapshots_id=snapshots_id,
+        import_only=args.import_only,
+        skip_post_processing=args.skip_post_processing)
+
+
+main()
diff --git a/apps/topics-mine/bin/topics_mine_worker.pl b/apps/topics-mine/bin/topics_mine_worker.pl
deleted file mode 100755
index 7f4636ebe5..0000000000
--- a/apps/topics-mine/bin/topics_mine_worker.pl
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use warnings;
-
-use Modern::Perl "2015";
-use MediaWords::CommonLibs;
-
-use MediaWords::TM::Worker;
-
-
-sub main()
-{
-    MediaWords::TM::Worker::start_topics_mine_worker( 'MediaWords::Job::TM::MineTopic' );
-}
-
-main();
diff --git a/apps/topics-mine/bin/topics_mine_worker.py b/apps/topics-mine/bin/topics_mine_worker.py
new file mode 100755
index 0000000000..3eb0963105
--- /dev/null
+++ b/apps/topics-mine/bin/topics_mine_worker.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+from mediawords.job import JobBroker
+from mediawords.util.log import create_logger
+from topics_mine.mine import run_worker_job
+
+log = create_logger(__name__)
+
+QUEUE_NAME = 'MediaWords::Job::TM::MineTopic'
+
+if __name__ == '__main__':
+    app = JobBroker(queue_name=QUEUE_NAME)
+    app.start_worker(handler=run_worker_job)
diff --git a/apps/topics-mine/docker-compose.tests.yml b/apps/topics-mine/docker-compose.tests.yml
index ed2e90a371..ec6299230e 100644
--- a/apps/topics-mine/docker-compose.tests.yml
+++ b/apps/topics-mine/docker-compose.tests.yml
@@ -42,6 +42,7 @@ services:
             - postgresql-pgbouncer
             - rabbitmq-server
             - topics-fetch-link
+            - topics-fetch-twitter-urls
             - topics-extract-story-links
             # 1) test_topics_mine.t calls topics-fetch-link
             # 2) topics-fetch-link calls _try_fetch_topic_url()
             # 3) _try_fetch_topic_url() calls generate_story()
             # 4) generate_story() calls _extract_story()
             # 5) _extract_story() runs a remote extraction job
             - extract-and-vector
+            - solr-shard-01
+            - import-solr-data-for-testing
+            - facebook-fetch-story-stats
 
     extract-and-vector:
        image: 
gcr.io/mcback/extract-and-vector:latest @@ -127,6 +131,27 @@ services: source: ./../rabbitmq-server/conf/ target: /etc/rabbitmq/ + topics-fetch-twitter-urls: + image: dockermediacloud/topics-fetch-twitter-urls:latest + init: true + stop_signal: SIGKILL + volumes: + - type: bind + source: ./../topics-fetch-twitter-urls/bin/ + target: /opt/mediacloud/bin/ + - type: bind + source: ./../topics-fetch-twitter-urls/src/ + target: /opt/mediacloud/src/topics-fetch-twitter-urls/ + - type: bind + source: ./../topics-base/src/ + target: /opt/mediacloud/src/topics-base/ + - type: bind + source: ./../common/src/ + target: /opt/mediacloud/src/common/ + depends_on: + - postgresql-pgbouncer + - rabbitmq-server + topics-fetch-link: image: gcr.io/mcback/topics-fetch-link:latest init: true @@ -174,3 +199,83 @@ services: - postgresql-pgbouncer # Uses extractor HTTP service directly to get raw extracted HTML: - extract-article-from-page + + import-solr-data-for-testing: + image: dockermediacloud/import-solr-data-for-testing:latest + init: true + environment: + MC_SOLR_IMPORT_MAX_QUEUED_STORIES: 100000 + stop_signal: SIGKILL + volumes: + - type: bind + source: ./../import-solr-data-for-testing/bin/ + target: /opt/mediacloud/bin/ + - type: bind + source: ./../import-solr-data/src/ + target: /opt/mediacloud/src/import-solr-data/ + - type: bind + source: ./../common/src/ + target: /opt/mediacloud/src/common/ + depends_on: + - postgresql-pgbouncer + - solr-shard-01 + + solr-shard-01: + image: dockermediacloud/solr-shard:latest + init: true + stop_signal: SIGKILL + environment: + MC_SOLR_SHARD_COUNT: "1" + expose: + - 8983 + volumes: + - type: bind + source: ./../solr-base/src/solr/ + target: /usr/src/solr/ + - type: bind + source: ./../solr-shard/bin/solr-shard.sh + target: /solr-shard.sh + depends_on: + - solr-zookeeper + + solr-zookeeper: + image: dockermediacloud/solr-zookeeper:latest + init: true + stop_signal: SIGKILL + expose: + - 2181 + - 2888 + - 3888 + volumes: + - type: bind + source: ./../solr-zookeeper/conf/ + target: /opt/zookeeper/conf/ + - type: bind + source: ./../solr-zookeeper/bin/zookeeper.sh + target: /zookeeper.sh + + facebook-fetch-story-stats: + image: dockermediacloud/facebook-fetch-story-stats:latest + init: true + stop_signal: SIGKILL + environment: + MC_FACEBOOK_APP_ID: "IGNORE NOT NEEDED" + MC_FACEBOOK_APP_SECRET: "IGNORE NOT NEEEDED" + volumes: + - type: bind + source: ./../facebook-fetch-story-stats/bin/ + target: /opt/mediacloud/bin/ + - type: bind + source: ./../facebook-fetch-story-stats/src/ + target: /opt/mediacloud/src/facebook-fetch-story-stats/ + - type: bind + source: ./../facebook-fetch-story-stats/tests/ + target: /opt/mediacloud/tests/ + - type: bind + source: ./../common/src/ + target: /opt/mediacloud/src/common/ + depends_on: + - postgresql-pgbouncer + - rabbitmq-server + + diff --git a/apps/topics-mine/src/cpanfile b/apps/topics-mine/src/cpanfile deleted file mode 100644 index dca604e1ea..0000000000 --- a/apps/topics-mine/src/cpanfile +++ /dev/null @@ -1,3 +0,0 @@ -requires 'Date::Format'; -requires 'Text::Lorem::More'; -requires 'Time::Piece'; diff --git a/apps/topics-mine/src/perl/MediaWords/Config/TopicsMine.pm b/apps/topics-mine/src/perl/MediaWords/Config/TopicsMine.pm deleted file mode 100644 index 2f521c07f8..0000000000 --- a/apps/topics-mine/src/perl/MediaWords/Config/TopicsMine.pm +++ /dev/null @@ -1,36 +0,0 @@ -package MediaWords::Util::Config::TopicsMine; - -use strict; -use warnings; - -use Modern::Perl "2015"; - -# Deliberately don't include 
MediaWords::CommonLibs as it includes this package itself - -{ - package MediaWords::Util::Config::TopicsMine::PythonProxy; - - use strict; - use warnings; - - use Modern::Perl "2015"; - use MediaWords::CommonLibs; - - use MediaWords::Util::Python; - - MediaWords::Util::Python::import_python_module( __PACKAGE__, 'topics_mine.config' ); - - 1; -} - -sub _python_config() -{ - return MediaWords::Util::Config::TopicsMine::PythonProxy::TopicsMineConfig->new(); -} - -sub crimson_hexagon_api_key() -{ - return _python_config()->crimson_hexagon_api_key(); -} - -1; diff --git a/apps/topics-mine/src/perl/MediaWords/TM/FetchTopicPosts.pm b/apps/topics-mine/src/perl/MediaWords/TM/FetchTopicPosts.pm deleted file mode 100644 index 7bb4f454d7..0000000000 --- a/apps/topics-mine/src/perl/MediaWords/TM/FetchTopicPosts.pm +++ /dev/null @@ -1,11 +0,0 @@ -package MediaWords::TM::FetchTopicPosts; - -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -import_python_module( __PACKAGE__, 'topics_mine.fetch_topic_posts' ); - -1; diff --git a/apps/topics-mine/src/perl/MediaWords/TM/Mine.pm b/apps/topics-mine/src/perl/MediaWords/TM/Mine.pm deleted file mode 100644 index 4d0919870c..0000000000 --- a/apps/topics-mine/src/perl/MediaWords/TM/Mine.pm +++ /dev/null @@ -1,1219 +0,0 @@ -package MediaWords::TM::Mine; - -=head1 NAME - -MediaWords::TM::Mine - topic spider implementation - -=head1 SYNOPSIS - - MediaWords::TM::Mine::mine_topic( $db, $options ); - -=head1 DESCRIPTION - -The topic mining process is described in doc/topic_mining.markdown. - -=cut - -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Getopt::Long; -use List::MoreUtils; -use List::Util; -use Readonly; -use Time::Piece; - -use MediaWords::TM::Alert; -use MediaWords::TM::FetchTopicPosts; -use MediaWords::TM::Stories; -use MediaWords::DBI::Stories; -use MediaWords::DBI::Stories::GuessDate; -use MediaWords::Job::Broker; -use MediaWords::Job::StatefulBroker; -use MediaWords::Solr; -use MediaWords::Solr::Query; -use MediaWords::Util::SQL; - -# total time to wait for fetching of social media metrics -Readonly my $MAX_SOCIAL_MEDIA_FETCH_TIME => ( 60 * 60 * 24 ); - -# add new links in chunks of this size -Readonly my $ADD_NEW_LINKS_CHUNK_SIZE => 10_000; - -# extract story links in chunks of this size -Readonly my $EXTRACT_STORY_LINKS_CHUNK_SIZE => 1000; - -# query this many topic_links at a time to spider -Readonly my $SPIDER_LINKS_CHUNK_SIZE => 100_000; - -# die if the error rate for link fetch or link extract jobs is greater than this -Readonly my $MAX_JOB_ERROR_RATE => 0.02; - -# timeout when polling for jobs to finish -Readonly my $JOB_POLL_TIMEOUT => 600; - -# number of seconds to wait when polling for jobs to finish -Readonly my $JOB_POLL_WAIT => 5; - -# if more than this many seed urls are imported, dedup stories before as well as after spidering -Readonly my $MIN_SEED_IMPORT_FOR_PREDUP_STORIES => 50_000; - -# how many link extraction jobs per 1000 can we ignore if they hang -Readonly my $MAX_LINK_EXTRACTION_TIMEOUT => 10; - -# how long to wait to timeout link extraction -Readonly my $LINK_EXTRACTION_POLL_TIMEOUT => 600; - -# if mine_topic is run with the test_mode option, set this true and do not try to queue extractions -my $_test_mode; - -# update topics.state in the database -sub update_topic_state($$$) -{ - my ( $db, $state_updater, $message ) = @_; - - INFO( "update topic state: $message" ); - - unless ( $state_updater ) { - # Shouldn't happen but let's just test it here - 
ERROR "State updater is unset."; - return; - } - - eval { - $state_updater->update_job_state_message( $db, $message ); - }; - if ( $@ ) - { - die "Error updating job state: $@"; - } -} - -# return true if the publish date of the story is within 7 days of the topic date range or if the -# story is undateable -sub story_within_topic_date_range -{ - my ( $db, $topic, $story ) = @_; - - return 1 unless ( $story->{ publish_date } ); - - my $story_date = substr( $story->{ publish_date }, 0, 10 ); - - my $start_date = $topic->{ start_date }; - $start_date = MediaWords::Util::SQL::increment_day( $start_date, -7 ); - $start_date = substr( $start_date, 0, 10 ); - - my $end_date = $topic->{ end_date }; - $end_date = MediaWords::Util::SQL::increment_day( $end_date, 7 ); - $end_date = substr( $end_date, 0, 10 ); - - return 1 if ( ( $story_date ge $start_date ) && ( $story_date le $end_date ) ); - - return MediaWords::DBI::Stories::GuessDate::is_undateable( $db, $story ); -} - -# submit jobs to extract links from the given stories and then poll to wait for the stories to be processed within -# the jobs pool -sub generate_topic_links -{ - my ( $db, $topic, $stories ) = @_; - - INFO "generate topic links: " . scalar( @{ $stories } ); - - my $topic_links = []; - - if ( $topic->{ platform } ne 'web' ) - { - INFO( "skip link generation for non web topic" ); - return; - } - - my $stories_ids_table = $db->get_temporary_ids_table( [ map { $_->{ stories_id } } @{ $stories } ] ); - - $db->query( <{ topics_id } ); -update topic_stories set link_mined = 'f' - where - stories_id in ( select id from $stories_ids_table ) and - topics_id = ? and - link_mined = 't' -SQL - - my $queued_stories_ids = []; - for my $story ( @{ $stories } ) - { - next unless ( story_within_topic_date_range( $db, $topic, $story ) ); - - push( @{ $queued_stories_ids }, $story->{ stories_id } ); - - MediaWords::Job::Broker->new( 'MediaWords::Job::TM::ExtractStoryLinks' )->add_to_queue( - { stories_id => $story->{ stories_id }, topics_id => $topic->{ topics_id } }, # - ); - - TRACE( "queued link extraction for story $story->{ title } $story->{ url }." ); - } - - INFO( "waiting for " . scalar( @{ $queued_stories_ids } ) . " link extraction jobs to finish" ); - - my $queued_ids_table = $db->get_temporary_ids_table( $queued_stories_ids ); - - # poll every $JOB_POLL_WAIT seconds waiting for the jobs to complete. die if the number of stories left to process - # has not shrunk for $EXTRACTION_POLL_TIMEOUT seconds. - my $prev_num_queued_stories = scalar( @{ $stories } ); - my $last_change_time = time(); - while ( 1 ) - { - my $queued_stories = $db->query( <{ topics_id } )->flat(); -select stories_id from topic_stories - where stories_id in ( select id from $queued_ids_table ) and topics_id = ? and link_mined = 'f' -SQL - - my $num_queued_stories = scalar( @{ $queued_stories } ); - - last unless ( $num_queued_stories ); - - $last_change_time = time() if ( $num_queued_stories != $prev_num_queued_stories ); - if ( ( time() - $last_change_time ) > $LINK_EXTRACTION_POLL_TIMEOUT ) - { - my $ids_list = join( ', ', @{ $queued_stories } ); - if ( $num_queued_stories > $MAX_LINK_EXTRACTION_TIMEOUT ) - { - LOGDIE( "Timed out waiting for story link extraction ($ids_list)." ); - } - - $db->query( <{ topics_id } ); -update topic_stories set link_mine_error = 'time out' where stories_id in ( $ids_list ) and topics_id = ? -SQL - last; - } - - INFO( "$num_queued_stories stories left in link extraction pool...." 
); - - $prev_num_queued_stories = $num_queued_stories; - sleep( $JOB_POLL_WAIT ); - } - - $db->query( <{ topics_id } ); -update topic_stories set link_mined = 't' - where stories_id in ( select id from $stories_ids_table ) and topics_id = ? and link_mined = 'f' -SQL - $db->query( "drop table $stories_ids_table" ); -} - -# die() with an appropriate error if topic_stories > topics.max_stories; because this check is expensive and we don't -# care if the topic goes over by a few thousand stories, we only actually run the check randmly 1/1000 of the time -sub die_if_max_stories_exceeded($$) -{ - my ( $db, $topic ) = @_; - - my ( $num_topic_stories ) = $db->query( <{ topics_id } )->flat; -select count(*) from topic_stories where topics_id = ? -SQL - - if ( $num_topic_stories > $topic->{ max_stories } ) - { - LOGDIE( "topic has $num_topic_stories stories, which exceeds topic max stories of $topic->{ max_stories }" ); - } -} - -# add the topic_fetch_url to the fetch_link job queue. try repeatedly on failure. -sub queue_topic_fetch_url($;$) -{ - my ( $tfu, $domain_timeout ) = @_; - - $domain_timeout //= $_test_mode ? 0 : undef; - - MediaWords::Job::Broker->new( 'MediaWords::Job::TM::FetchLink' )->add_to_queue( - { - topic_fetch_urls_id => $tfu->{ topic_fetch_urls_id }, - domain_timeout => $domain_timeout - } - ); -} - -# create topic_fetch_urls rows correpsonding to the links and queue a FetchLink job for each. return the tfu rows. -sub create_and_queue_topic_fetch_urls($$$) -{ - my ( $db, $topic, $fetch_links ) = @_; - - my $tfus = []; - for my $link ( @{ $fetch_links } ) - { - if ( $link->{ topic_links_id } && !$db->find_by_id( 'topic_links', $link->{ topic_links_id } ) ) - { - next; - } - my $tfu = $db->create( - 'topic_fetch_urls', - { - topics_id => $topic->{ topics_id }, - url => $link->{ url }, - state => 'pending', - assume_match => MediaWords::Util::Python::normalize_boolean_for_db( $link->{ assume_match } ), - topic_links_id => $link->{ topic_links_id }, - } - ); - push( @{ $tfus }, $tfu ); - - queue_topic_fetch_url( $tfu ); - } - - return $tfus; -} - -sub _fetch_twitter_urls($$$) -{ - my ( $db, $topic, $tfu_ids_list ) = @_; - - # we run into quota limitations with twitter sometimes and need a longer timeout - my $twitter_poll_timeout = $JOB_POLL_TIMEOUT * 5; - - my $twitter_tfu_ids = $db->query( <flat(); -select topic_fetch_urls_id - from topic_fetch_urls tfu - where - tfu.state = 'tweet pending' and - tfu.topic_fetch_urls_id in ( $tfu_ids_list ) -SQL - - return unless ( scalar( @{ $twitter_tfu_ids } ) > 0 ); - - my $tfu_ids_table = $db->get_temporary_ids_table( $twitter_tfu_ids ); - - MediaWords::Job::Broker->new( 'MediaWords::Job::TM::FetchTwitterUrls' )->add_to_queue( - { topic_fetch_urls_ids => $twitter_tfu_ids } - ); - - INFO( "waiting for fetch twitter urls job for " . scalar( @{ $twitter_tfu_ids } ) . " urls" ); - - # poll every $sleep_time seconds waiting for the jobs to complete. die if the number of stories left to process - # has not shrunk for $large_timeout seconds. warn but continue if the number of stories left to process - # is only 5% of the total and short_timeout has passed (this is to make the topic not hang entirely because - # of one link extractor job error). 
- my $prev_num_queued_urls = scalar( @{ $twitter_tfu_ids } ); - my $last_change_time = time(); - while ( 1 ) - { - my $queued_tfus = $db->query( <hashes(); -select tfu.* - from topic_fetch_urls tfu - join $tfu_ids_table ids on ( tfu.topic_fetch_urls_id = ids.id ) - where - state in ('tweet pending') -SQL - - my $num_queued_urls = scalar( @{ $queued_tfus } ); - - last if ( $num_queued_urls == 0 ); - - $last_change_time = time() if ( $num_queued_urls != $prev_num_queued_urls ); - if ( ( time() - $last_change_time ) > $twitter_poll_timeout ) - { - LOGDIE( "Timed out waiting for twitter fetching.\n" . Dumper( $queued_tfus ) ); - } - - INFO( "$num_queued_urls twitter urls left to fetch ..." ); - - $prev_num_queued_urls = $num_queued_urls; - sleep( $JOB_POLL_WAIT ); - } -} - -# list a sample of the pending urls for fetching -sub show_pending_urls($) -{ - my ( $pending_urls ) = @_; - - my $num_pending_urls = scalar( @{ $pending_urls } ); - - my $num_printed_urls = List::Util::min( $num_pending_urls, 3 ); - - my @shuffled_ids = List::Util::shuffle( 0 .. ( $num_pending_urls - 1 ) ); - - for my $id ( @shuffled_ids[ 0 .. ( $num_printed_urls - 1 ) ] ) - { - my $url = $pending_urls->[ $id ]; - INFO( "pending url: $url->{ url } [$url->{ state }: $url->{ fetch_date }]" ); - } -} - -# fetch the given links by creating topic_fetch_urls rows and sending them to the FetchLink queue -# for processing. wait for the queue to complete and returnt the resulting topic_fetch_urls. -sub fetch_links -{ - my ( $db, $topic, $fetch_links ) = @_; - - INFO( "fetch_links: queue links" ); - my $tfus = create_and_queue_topic_fetch_urls( $db, $topic, $fetch_links ); - my $num_queued_links = scalar( @{ $fetch_links } ); - - INFO( "waiting for fetch link queue: $num_queued_links queued" ); - - my $tfu_ids_list = join( ',', map { int( $_->{ topic_fetch_urls_id } ) } @{ $tfus } ); - - my $requeues = 0; - my $max_requeues = 1; - my $max_requeue_jobs = 100; - my $requeue_timeout = 30; - my $instant_requeued = 0; - - # once the pool is this small, just requeue everything with a 0 per site throttle - my $instant_queue_size = 25; - - # how many times to requeues everything if there is no change for $JOB_POLL_TIMEOUT seconds - my $full_requeues = 0; - my $max_full_requeues = 1; - - my $last_pending_change = time(); - my $last_num_pending_urls = 0; - while ( 1 ) - { - my $pending_urls = $db->query( <hashes(); -select *, coalesce( fetch_date::text, 'null' ) fetch_date - from topic_fetch_urls - where - topic_fetch_urls_id in ( $tfu_ids_list ) and - state in ( 'pending', 'requeued' ) -SQL - - my $pending_url_ids = [ map { $_->{ topic_fetch_urls_id } } @{ $pending_urls } ]; - - my $num_pending_urls = scalar( @{ $pending_url_ids } ); - - INFO( "waiting for fetch link queue: $num_pending_urls links remaining ..." ); - - show_pending_urls( $pending_urls ); - - last if ( $num_pending_urls < 1 ); - - # if we only have a handful of job left, requeue them all once with a 0 domain throttle - if ( !$instant_requeued && ( $num_pending_urls <= $instant_queue_size ) ) - { - $instant_requeued = 1; - map { queue_topic_fetch_url( $db->require_by_id( 'topic_fetch_urls', $_ ), 0 ) } @{ $pending_url_ids }; - sleep( $JOB_POLL_WAIT ); - next; - } - - my $time_since_change = time() - $last_pending_change; - - # for some reason, the fetch_link queue is occasionally losing a small number of jobs. 
- if ( ( $time_since_change > $requeue_timeout ) - && ( $requeues < $max_requeues ) - && ( $num_pending_urls < $max_requeue_jobs ) ) - { - INFO( "requeueing fetch_link $num_pending_urls jobs ... [requeue $requeues]" ); - - # requeue with a domain_timeout of 0 so that requeued urls can ignore throttling - map { queue_topic_fetch_url( $db->require_by_id( 'topic_fetch_urls', $_ ), 0 ) } @{ $pending_url_ids }; - ++$requeues; - $last_pending_change = time(); - } - - if ( $time_since_change > $JOB_POLL_TIMEOUT ) - { - if ( $num_pending_urls > $max_requeue_jobs ) - { - die( "timed out waiting for fetch_link jobs: " . scalar( @{ $pending_url_ids } ) ); - } - elsif ( $full_requeues < $max_full_requeues ) - { - map { queue_topic_fetch_url( $db->require_by_id( 'topic_fetch_urls', $_ ) ) } @{ $pending_url_ids }; - ++$full_requeues; - $last_pending_change = time(); - } - else - { - for my $id ( @{ $pending_url_ids } ) - { - $db->update_by_id( 'topic_fetch_urls', $id, { state => 'python error', message => 'timed out' } ); - } - INFO( "timed out " . scalar( @{ $pending_url_ids } ) . " urls" ); - } - } - - $last_pending_change = time() if ( $num_pending_urls < $last_num_pending_urls ); - - $last_num_pending_urls = $num_pending_urls; - - sleep( $JOB_POLL_WAIT ); - } - - _fetch_twitter_urls( $db, $topic, $tfu_ids_list ); - - INFO( "fetch_links: update topic seed urls" ); - $db->query( <query( <hashes(); -select * from topic_fetch_urls where topic_fetch_urls_id in ( $tfu_ids_list ) -SQL - - INFO( "completed fetch link queue" ); - - return $completed_tfus; -} - -# download any unmatched link in new_links, add it as a story, extract it, add any links to the topic_links list. -# each hash within new_links can either be a topic_links hash or simply a hash with a { url } field. if -# the link is a topic_links hash, the topic_link will be updated in the database to point ref_stories_id -# to the new link story. For each link, set the { story } field to the story found or created for the link. -sub add_new_links_chunk($$$$) -{ - my ( $db, $topic, $iteration, $new_links ) = @_; - - die_if_max_stories_exceeded( $db, $topic ); - - INFO( "add_new_links_chunk: fetch_links" ); - my $topic_fetch_urls = fetch_links( $db, $topic, $new_links ); - - INFO( "add_new_links_chunk: mark topic links spidered" ); - my $link_ids = [ grep { $_ } map { $_->{ topic_links_id } } @{ $new_links } ]; - $db->query( < $topic->{ topics_id }, - iteration => $iteration, - links_processed => $num_links, - elapsed_time => $elapsed_time - }; - - $db->create( 'topic_spider_metrics', $topic_spider_metric ); -} - -# call add_new_links in chunks of $ADD_NEW_LINKS_CHUNK_SIZE so we don't lose too much work when we restart the spider -sub add_new_links($$$$;$) -{ - my ( $db, $topic, $iteration, $new_links, $state_updater ) = @_; - - INFO( "add new links" ); - - return unless ( @{ $new_links } ); - - # randomly shuffle the links because it is better for downloading (which has per medium throttling) and extraction - # (which has per medium locking) to distribute urls from the same media source randomly among the list of links. the - # link mining and solr seeding routines that feed most links to this function tend to naturally group links - # from the same media source together. 
- my $shuffled_links = [ List::Util::shuffle( @{ $new_links } ) ]; - - my $spider_progress = get_spider_progress_description( $db, $topic, $iteration, scalar( @{ $shuffled_links } ) ); - - my $num_links = scalar( @{ $shuffled_links } ); - for ( my $i = 0 ; $i < $num_links ; $i += $ADD_NEW_LINKS_CHUNK_SIZE ) - { - my $start_time = time; - - update_topic_state( $db, $state_updater, "$spider_progress; iteration links: $i / $num_links" ); - - my $end = List::Util::min( $i + $ADD_NEW_LINKS_CHUNK_SIZE - 1, $#{ $shuffled_links } ); - add_new_links_chunk( $db, $topic, $iteration, [ @{ $shuffled_links }[ $i .. $end ] ] ); - - my $elapsed_time = time - $start_time; - save_metrics( $db, $topic, $iteration, $end - $i, $elapsed_time ); - } - - mine_topic_stories( $db, $topic ); -} - -# find any links for the topic of this iteration or less that have not already been spidered and call -# add_new_links on them. -sub spider_new_links($$$;$) -{ - my ( $db, $topic, $iteration, $state_updater ) = @_; - - while ( 1 ) - { - INFO( "querying new links ..." ); - - $db->query( "drop table if exists _new_links" ); - - my $num_new_links = $db->query( <{ topics_id } )->rows(); -create temporary table _new_links as - select tl.* - from topic_links tl, topic_stories ts - where - tl.link_spidered = 'f' and - tl.stories_id = ts.stories_id and - ( ts.iteration <= \$1 or ts.iteration = 1000 ) and - ts.topics_id = \$2 and - tl.topics_id = \$2 - order by random() -END - - $db->query( "create index _new_links_tl on _new_links ( topic_links_id )" ); - - last if ( $num_new_links < 1 ); - - INFO( "found $num_new_links new links" ); - - while ( 1 ) - { - my $new_links = $db->query( "select * from _new_links limit ?", $SPIDER_LINKS_CHUNK_SIZE )->hashes(); - - last unless ( @{ $new_links } ); - - my $tl_ids_list = join( ',', map { $_->{ topic_links_id } } @{ $new_links } ); - $db->query( "delete from _new_links where topic_links_id in ($tl_ids_list)" ); - add_new_links( $db, $topic, $iteration, $new_links, $state_updater ); - } - } -} - -# get short text description of spidering progress -sub get_spider_progress_description($$$$) -{ - my ( $db, $topic, $iteration, $total_links ) = @_; - - INFO( "get spider progress description" ); - - my $cid = $topic->{ topics_id }; - - my ( $total_stories ) = $db->query( <flat; -select count(*) from topic_stories where topics_id = ? -SQL - - my ( $stories_last_iteration ) = $db->query( <flat; -select count(*) from topic_stories where topics_id = ? and iteration = ? - 1 -SQL - - my ( $queued_links ) = $db->query( <flat; -select count(*) from topic_links where topics_id = ? and link_spidered = 'f' -SQL - - return "spidering iteration: $iteration; stories last iteration / total: " . - "$stories_last_iteration / $total_stories; links queued: $queued_links; iteration links: $total_links"; -} - -# run the spider over any new links, for $num_iterations iterations -sub run_spider($$;$) -{ - my ( $db, $topic, $state_updater ) = @_; - - INFO( "run spider" ); - - # before we run the spider over links, we need to make sure links have been generated for all existing stories - mine_topic_stories( $db, $topic ); - - map { spider_new_links( $db, $topic, $topic->{ max_iterations }, $state_updater ) } ( 1 .. 
$topic->{ max_iterations } ); -} - -# mine for links any stories in topic_stories for this topic that have not already been mined -sub mine_topic_stories -{ - my ( $db, $topic ) = @_; - - INFO( "mine topic stories" ); - - # skip for non-web topic, because the below query grows very large without ever mining links - if ( $topic->{ platform } ne 'web' ) - { - INFO( "skip link generation for non-web topic" ); - return; - } - - # chunk the story extractions so that one big topic does not take over the entire queue - my $i = 0; - while ( 1 ) - { - $i += $EXTRACT_STORY_LINKS_CHUNK_SIZE; - INFO( "mine topic stories: chunked $i ..." ); - my $stories = $db->query( <{ topics_id }, $EXTRACT_STORY_LINKS_CHUNK_SIZE )->hashes; - select s.*, ts.link_mined, ts.redirect_url - from snap.live_stories s - join topic_stories ts on ( s.stories_id = ts.stories_id and s.topics_id = ts.topics_id ) - where - ts.link_mined = false and - ts.topics_id = ? - limit ? -SQL - - my $num_stories = scalar( @{ $stories } ); - - last if ( $num_stories == 0 ); - - generate_topic_links( $db, $topic, $stories ); - - last if ( $num_stories < $EXTRACT_STORY_LINKS_CHUNK_SIZE ); - } -} - -# import all topic_seed_urls that have not already been processed; -# return 1 if new stories were added to the topic and 0 if not -sub import_seed_urls($$;$) -{ - my ( $db, $topic, $state_updater ) = @_; - - INFO( "import seed urls" ); - - my $topics_id = $topic->{ topics_id }; - - # take care of any seed urls with urls that we have already processed for this topic - $db->query( <query( <hashes; -select * from topic_seed_urls where topics_id = ? and processed = 'f' order by random() -END - - return 0 unless ( @{ $seed_urls } ); - - # process these in chunks in case we have to start over so that we don't have to redo the whole batch - my $num_urls = scalar( @{ $seed_urls } ); - for ( my $i = 0 ; $i < $num_urls ; $i += $ADD_NEW_LINKS_CHUNK_SIZE ) - { - my $start_time = time; - - update_topic_state( $db, $state_updater, "importing seed urls: $i / $num_urls" ); - - my $end = List::Util::min( $i + $ADD_NEW_LINKS_CHUNK_SIZE - 1, $#{ $seed_urls } ); - - # verify that the seed urls are still there and not processed, in case we have mucked with them while spidering - my $urls_ids_list = join( ',', map { int( $_->{ topic_seed_urls_id } ) } @{ $seed_urls }[ $i .. $end] ); - my $seed_urls_chunk = $db->query( <hashes(); -select * from topic_seed_urls where topic_seed_urls_id in ( $urls_ids_list ) and not processed -SQL - - add_new_links_chunk( $db, $topic, 0, $seed_urls_chunk ); - - my $ids_list = join( ',', map { int( $_->{ topic_seed_urls_id } ) } @{ $seed_urls_chunk } ); - - # update topic_seed_urls that were actually fetched - $db->query( <query( <query( - <{ topics_id } - ); - - return scalar( @{ $seed_urls } ); -} - - -# insert a list of topic seed urls -sub insert_topic_seed_urls -{ - my ( $db, $topic_seed_urls ) = @_; - - INFO "inserting " . scalar( @{ $topic_seed_urls } ) . " topic seed urls ..."; - - for my $tsu ( @{ $topic_seed_urls } ) - { - my $insert_tsu; - map { $insert_tsu->{ $_ } = $tsu->{ $_ } } qw/stories_id url topics_id assume_match/; - $db->create( 'topic_seed_urls', $insert_tsu ); - } -} - -# return true if the given month offset is within the dates that should be respidered. 
always return true -# if there are not respider dates -sub _import_month_within_respider_date($$) -{ - my ( $topic, $month_offset ) = @_; - - my $start_date = $topic->{ respider_start_date } || '';; - my $end_date = $topic->{ respider_end_date } || ''; - - return 1 unless ( $topic->{ respider_stories } && ( $start_date || $end_date ) ); - - my $month_date = Time::Piece->strptime( $topic->{ start_date }, "%Y-%m-%d" )->add_months( $month_offset ); - - if ( $end_date ) - { - my $end_date = Time::Piece->strptime( $end_date, "%Y-%m-%d" )->add_months( -1 ); - return 1 if ( $month_date > $end_date ); - } - - if ( $start_date ) - { - my $start_date = Time::Piece->strptime( $start_date, "%Y-%m-%d" ); - return 1 if ( $month_date < $start_date ); - } - - return 0; -} - -# Call search_solr_for_stories_ids() above and then query PostgreSQL for the stories returned by Solr. -# Include stories.* and media_name as the returned fields. -sub __search_for_stories($$) -{ - my ( $db, $params ) = @_; - - my $stories_ids = MediaWords::Solr::search_solr_for_stories_ids( $db, $params ); - - my $stories = [ map { { stories_id => $_ } } @{ $stories_ids } ]; - - $stories = MediaWords::DBI::Stories::attach_story_meta_data_to_stories( $db, $stories ); - - $stories = [ grep { $_->{ url } } @{ $stories } ]; - - return $stories; -} - -# import a single month of the solr seed query. we do this to avoid giant queries that timeout in solr. -sub import_solr_seed_query_month($$$) -{ - my ( $db, $topic, $month_offset ) = @_; - - return 0 unless ( $topic->{ platform } eq 'web' ); - - my $solr_query = MediaWords::Solr::Query::get_full_solr_query_for_topic( $db, $topic, undef, undef, $month_offset ); - - # this should return undef once the month_offset gets too big - return undef unless ( $solr_query ); - - return 1 unless ( _import_month_within_respider_date( $topic, $month_offset ) ); - - my $max_stories = $topic->{ max_stories }; - - # if solr maxes out on returned stories, it returns a few documents less than the rows= parameter, so we - # assume that we hit the solr max if we are within 5% of the ma stories - my $max_returned_stories = $max_stories * 0.95; - - INFO "import solr seed query month offset $month_offset"; - $solr_query->{ rows } = $max_stories; - - my $stories = __search_for_stories( $db, $solr_query ); - - if ( scalar( @{ $stories } ) > $max_returned_stories ) - { - die( "solr_seed_query returned more than $max_returned_stories stories" ); - } - - INFO "adding " . scalar( @{ $stories } ) . " stories to topic_seed_urls"; - - my $topic_seed_urls = []; - for my $story ( @{ $stories } ) - { - push( - @{ $topic_seed_urls }, - { - topics_id => $topic->{ topics_id }, - url => $story->{ url }, - stories_id => $story->{ stories_id }, - assume_match => 'f' - } - ); - } - - insert_topic_seed_urls( $db, $topic_seed_urls ); - - return 1; -} - -# import stories intro topic_seed_urls from solr by running -# topic->{ solr_seed_query } against solr. if the solr query has -# already been imported, do nothing. 
-sub import_solr_seed_query -{ - my ( $db, $topic ) = @_; - - INFO( "import solr seed query" ); - - return if ( $topic->{ solr_seed_query_run } ); - - my $month_offset = 0; - while ( import_solr_seed_query_month( $db, $topic, $month_offset++ ) ) { } - - $db->query( "update topics set solr_seed_query_run = 't' where topics_id = ?", $topic->{ topics_id } ); -} - -# return true if there are no stories without facebook data -sub all_facebook_data_fetched -{ - my ( $db, $topic ) = @_; - - my $null_facebook_story = $db->query( <{ topics_id } )->hash; -select 1 - from topic_stories cs - left join story_statistics ss on ( cs.stories_id = ss.stories_id ) - where - cs.topics_id = ? and - ss.facebook_api_error is null and - ( - ss.stories_id is null or - ss.facebook_share_count is null or - ss.facebook_comment_count is null or - ss.facebook_api_collect_date is null - ) - limit 1 -SQL - - return !$null_facebook_story; -} - -# add all topic stories without facebook data to the queue -sub __add_topic_stories_to_facebook_queue($$) -{ - my ( $db, $topic ) = @_; - - my $topics_id = $topic->{ topics_id }; - - my $stories = $db->query( <hashes; -SELECT ss.*, cs.stories_id - FROM topic_stories cs - left join story_statistics ss on ( cs.stories_id = ss.stories_id ) - WHERE cs.topics_id = ? - ORDER BY cs.stories_id -END - - unless ( scalar @{ $stories } ) - { - DEBUG( "No stories found for topic '$topic->{ name }'" ); - } - - for my $ss ( @{ $stories } ) - { - my $stories_id = $ss->{ stories_id }; - my $args = { stories_id => $stories_id }; - - if ( $ss->{ facebook_api_error } - or !defined( $ss->{ facebook_api_collect_date } ) - or !defined( $ss->{ facebook_share_count } ) - or !defined( $ss->{ facebook_comment_count } ) ) - { - DEBUG( "Adding job for story $stories_id" ); - MediaWords::Job::Broker->new( 'MediaWords::Job::Facebook::FetchStoryStats' )->add_to_queue( $args ); - } - } -} - -# send high priority jobs to fetch facebook data for all stories that don't yet have it -sub fetch_social_media_data ($$) -{ - my ( $db, $topic ) = @_; - - INFO( "fetch social media data" ); - - # test spider should be able to run with job broker, so we skip social media collection - return if ( $_test_mode ); - - my $cid = $topic->{ topics_id }; - - __add_topic_stories_to_facebook_queue( $db, $topic ); - - my $poll_wait = 30; - my $retries = int( $MAX_SOCIAL_MEDIA_FETCH_TIME / $poll_wait ) + 1; - - for my $i ( 1 .. $retries ) - { - return if ( all_facebook_data_fetched( $db, $topic ) ); - sleep $poll_wait; - } - - LOGCONFESS( "Timed out waiting for social media data" ); -} - -# die if the error rate for link extraction or link fetching is too high -sub check_job_error_rate($$) -{ - my ( $db, $topic ) = @_; - - INFO( "check job error rate" ); - - my $fetch_stats = $db->query( <{ topics_id } )->hashes(); -select count(*) num, ( state = 'python error' ) as error - from topic_fetch_urls - where topics_id = ? 
- group by ( state = 'python error' ) -SQL - - my ( $num_fetch_errors, $num_fetch_successes ) = ( 0, 0 ); - for my $s ( @{ $fetch_stats } ) - { - if ( $s->{ error } ) { $num_fetch_errors += $s->{ num } } - else { $num_fetch_successes += $s->{ num } } - } - - my $fetch_error_rate = $num_fetch_errors / ( $num_fetch_errors + $num_fetch_successes + 1 ); - - INFO( "Fetch error rate: $fetch_error_rate ($num_fetch_errors / $num_fetch_successes)" ); - - if ( $fetch_error_rate > $MAX_JOB_ERROR_RATE ) - { - die( "Fetch error rate of $fetch_error_rate is greater than max of $MAX_JOB_ERROR_RATE" ); - } - - my $link_stats = $db->query( <{ topics_id } )->hashes(); -select count(*) num, ( length( link_mine_error) > 0 ) as error - from topic_stories - where topics_id = ? - group by ( length( link_mine_error ) > 0 ) -SQL - - my ( $num_link_errors, $num_link_successes ) = ( 0, 0 ); - for my $s ( @{ $link_stats } ) - { - if ( $s->{ error } ) { $num_link_errors += $s->{ num } } - else { $num_link_successes += $s->{ num } } - } - - my $link_error_rate = $num_link_errors / ( $num_link_errors + $num_link_successes + 1 ); - - INFO( "Link error rate: $link_error_rate ($num_link_errors / $num_link_successes)" ); - - if ( $link_error_rate > $MAX_JOB_ERROR_RATE ) - { - die( "link error rate of $link_error_rate is greater than max of $MAX_JOB_ERROR_RATE" ); - } -} - -# import urls from seed query -sub import_urls_from_seed_queries($$;$) -{ - my ( $db, $topic, $state_updater ) = @_; - - my $topic_seed_queries = $db->query( - "select * from topic_seed_queries where topics_id = ?", $topic->{ topics_id } )->hashes(); - - my $num_queries = scalar( @{ $topic_seed_queries } ); - - if ( ( $num_queries != 1 ) && ( $topic->{ mode } eq 'url_sharing' )) - { - die( "exactly one topic seed query required per url_sharing topic" ); - } - - if ( $topic->{ mode } eq 'web' ) - { - DEBUG( "import seed urls from solr" ); - update_topic_state( $db, $state_updater, "importing solr seed query" ); - import_solr_seed_query( $db, $topic ); - } - - for my $tsq ( @{ $topic_seed_queries } ) - { - my $tsq_dump = $tsq->{ topic_seed_queries_id }; - my $fetcher = MediaWords::TM::FetchTopicPosts::get_post_fetcher( $tsq ); - die( "unable to import seed urls for platform/source of seed query: $tsq_dump" ) unless ( $fetcher ); - - DEBUG( "import seed urls from fetch_topic_posts:\n$tsq_dump" ); - MediaWords::TM::FetchTopicPosts::fetch_topic_posts( $db, $tsq ); - } - - $db->query( <{ topics_id } ); -insert into topic_seed_urls ( url, topics_id, assume_match, source, topic_seed_queries_id, topic_post_urls_id ) - select distinct - tpu.url, - tsq.topics_id, - false, - 'topic_seed_queries', - tsq.topic_seed_queries_id, - tpu.topic_post_urls_id - from - topic_post_urls tpu - join topic_posts tp using ( topic_posts_id ) - join topic_post_days tpd using ( topic_post_days_id ) - join topic_seed_queries tsq using ( topic_seed_queries_id ) - where - tsq.topics_id = ? 
- on conflict ( topic_post_urls_id ) do nothing -SQL -} - -# if the query or dates have changed, set topic_stories.link_mined to false for the impacted stories so that -# they will be respidered -sub set_stories_respidering($$$) -{ - my ( $db, $topic, $snapshots_id ) = @_; - - return unless ( $topic->{ respider_stories } ); - - my $respider_start_date = $topic->{ respider_start_date }; - my $respider_end_date = $topic->{ respider_end_date }; - - if ( !$respider_start_date && !$respider_end_date ) - { - $db->query( "update topic_stories set link_mined = 'f' where topics_id = ?", $topic->{ topics_id } ); - return; - } - - if ( $respider_start_date ) - { - $db->query( <{ start_date }, $topic->{ topics_id } ); -update topic_stories ts set link_mined = 'f' - from stories s - where - ts.stories_id = s.stories_id and - s.publish_date >= \$2 and - s.publish_date <= \$1 and - ts.topics_id = \$3 -SQL - if ( $snapshots_id ) - { - $db->update_by_id( 'snapshots', $snapshots_id, { start_date => $topic->{ start_date } } ); - $db->query( <query( <{ end_date }, $topic->{ topics_id } ); -update topic_stories ts set link_mined = 'f' - from stories s - where - ts.stories_id = s.stories_id and - s.publish_date >= \$1 and - s.publish_date <= \$2 and - ts.topics_id = \$3 -SQL - - if ( $snapshots_id ) - { - $db->update_by_id( 'snapshots', $snapshots_id, { end_date => $topic->{ end_date } } ); - $db->query( < ? -SQL - } - } - - $db->update_by_id( 'topics', $topic->{ topics_id }, - { respider_stories => 'f', respider_start_date => undef, respider_end_date => undef } ); -} - - -# mine the given topic for links and to recursively discover new stories on the web. -# options: -# import_only - only run import_seed_urls and import_solr_seed and exit -# skip_post_processing - skip social media fetching and snapshotting -# snapshots_id - associate topic with the given existing snapshot -sub do_mine_topic($$;$$) -{ - my ( $db, $topic, $options, $state_updater ) = @_; - - map { $options->{ $_ } ||= 0 } qw/import_only skip_post_processing test_mode/; - - update_topic_state( $db, $state_updater, "importing seed urls" ); - import_urls_from_seed_queries( $db, $topic, $state_updater ); - - update_topic_state( $db, $state_updater, "setting stories respidering..." ); - set_stories_respidering( $db, $topic, $options->{ snapshots_id } ); - - # this may put entires into topic_seed_urls, so run it before import_seed_urls. - # something is breaking trying to call this perl. 
commenting out for time being since we only need - # this when we very rarely change the foreign_rss_links field of a media source - hal - # update_topic_state( $db, $state_updater, "merging foreign rss stories" ); - # MediaWords::TM::Stories::merge_foreign_rss_stories( $db, $topic ); - - update_topic_state( $db, $state_updater, "importing seed urls" ); - if ( import_seed_urls( $db, $topic, $state_updater ) > $MIN_SEED_IMPORT_FOR_PREDUP_STORIES ) - { - # merge dup stories before as well as after spidering to avoid extra spidering work - update_topic_state( $db, $state_updater, "merging duplicate stories" ); - MediaWords::TM::Stories::find_and_merge_dup_stories( $db, $topic ); - } - - unless ( $options->{ import_only } ) - { - update_topic_state( $db, $state_updater, "running spider" ); - run_spider( $db, $topic, $state_updater ); - - check_job_error_rate( $db, $topic ); - - # merge dup media and stories again to catch dups from spidering - update_topic_state( $db, $state_updater, "merging duplicate stories" ); - MediaWords::TM::Stories::find_and_merge_dup_stories( $db, $topic ); - - update_topic_state( $db, $state_updater, "merging duplicate media stories" ); - MediaWords::TM::Stories::merge_dup_media_stories( $db, $topic ); - - if ( !$options->{ skip_post_processing } ) - { - update_topic_state( $db, $state_updater, "fetching social media data" ); - fetch_social_media_data( $db, $topic ); - - update_topic_state( $db, $state_updater, "snapshotting" ); - my $snapshot_args = { topics_id => $topic->{ topics_id }, snapshots_id => $options->{ snapshots_id } }; - MediaWords::Job::StatefulBroker->new( 'MediaWords::Job::TM::SnapshotTopic' )->add_to_queue( $snapshot_args ); - } - } -} - -# wrap do_mine_topic in eval and handle errors and state -sub mine_topic ($$;$$) -{ - my ( $db, $topic, $options, $state_updater ) = @_; - - # the topic spider can sit around for long periods doing solr queries, so we need to make sure the postgres - # connection does not get timed out - $db->query( "set idle_in_transaction_session_timeout = 0" ); - - my $prev_test_mode = $_test_mode; - - $_test_mode = 1 if ( $options->{ test_mode } ); - - if ( $topic->{ state } ne 'running' ) - { - MediaWords::TM::Alert::send_topic_alert( $db, $topic, "started topic spidering" ); - } - - eval { do_mine_topic( $db, $topic, $options, $state_updater ); }; - if ( $@ ) - { - my $error = $@; - MediaWords::TM::Alert::send_topic_alert( $db, $topic, "aborted topic spidering due to error" ); - LOGDIE( $error ); - } - - $_test_mode = $prev_test_mode; -} - -1; diff --git a/apps/topics-mine/src/perl/MediaWords/TM/Worker.pm b/apps/topics-mine/src/perl/MediaWords/TM/Worker.pm deleted file mode 100644 index ce467d6869..0000000000 --- a/apps/topics-mine/src/perl/MediaWords/TM/Worker.pm +++ /dev/null @@ -1,95 +0,0 @@ -package MediaWords::TM::Worker; - -# -# Run through stories found for the given topic and find all the links in -# each story. -# -# For each link, try to find whether it matches any given story. If it doesn't, -# create a new story. Add that story's links to the queue if it matches the -# pattern for the topic. Write the resulting stories and links to -# topic_stories and topic_links. 
-# -# Options: -# -# * dedup_stories - run story deduping code over existing topic stories; -# only necessary to rerun new dedup code -# -# * import_only - only run import_seed_urls and import_solr_seed and return -# - -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use MediaWords::DB; -use MediaWords::Job::Lock; -use MediaWords::Job::State; -use MediaWords::Job::State::ExtraTable; -use MediaWords::Job::StatefulBroker; -use MediaWords::TM::Mine; - - -sub run_job($) -{ - my $args = shift; - - my $db = MediaWords::DB::connect_to_db(); - - my $topics_id = $args->{ topics_id }; - my $import_only = $args->{ import_only } // 0; - my $cache_broken_downloads = $args->{ cache_broken_downloads } // 0; - my $skip_outgoing_foreign_rss_links = $args->{ skip_outgoing_foreign_rss_links } // 0; - my $skip_post_processing = $args->{ skip_post_processing } // 0; - my $test_mode = $args->{ test_mode } // 0; - my $snapshots_id = $args->{ snapshots_id } // undef; - - my $state_updater = $args->{ state_updater }; - - unless ( $topics_id ) - { - die "'topics_id' is not set."; - } - - unless ( $state_updater ) { - die "State updater is not set."; - } - - my $topic = $db->find_by_id( 'topics', $topics_id ) - or die( "Unable to find topic '$topics_id'" ); - - my $options = { - import_only => $import_only, - cache_broken_downloads => $cache_broken_downloads, - skip_outgoing_foreign_rss_links => $skip_outgoing_foreign_rss_links, - skip_post_processing => $skip_post_processing, - test_mode => $test_mode, - snapshots_id => $snapshots_id - }; - - MediaWords::TM::Mine::mine_topic( $db, $topic, $options, $state_updater ); -} - -sub start_topics_mine_worker($) -{ - my $queue_name = shift; - - my $app = MediaWords::Job::StatefulBroker->new( $queue_name ); - - my $lock = MediaWords::Job::Lock->new( - - # Define this here so that ::MineTopicPublic operates on the same lock - 'MediaWords::Job::TM::MineTopic', - - # Only run one job for each topic at a time - 'topics_id', - - ); - - my $extra_table = MediaWords::Job::State::ExtraTable->new( 'topics', 'state', 'message' ); - my $state = MediaWords::Job::State->new( $extra_table ); - $app->start_worker( \&run_job, $lock, $state ); -} - -1; diff --git a/apps/topics-mine/src/python/topics_mine/mine.py b/apps/topics-mine/src/python/topics_mine/mine.py new file mode 100644 index 0000000000..1588b3446d --- /dev/null +++ b/apps/topics-mine/src/python/topics_mine/mine.py @@ -0,0 +1,1150 @@ +""" +topic spider implementation + +this package implements the parent spider job, which runs the initial seed queries and then queues and +manages the children jobs to fetch and extract links, to fetch social media data, and so on. + +the topic mining process is described in doc/topic_mining.markdown. 
+""" + +import datetime +from dateutil.relativedelta import relativedelta +import random +from time import sleep, time +from typing import Optional, Callable + +from mediawords.db import DatabaseHandler +from mediawords.db.locks import get_session_lock, release_session_lock +import mediawords.dbi.stories +from mediawords.job import JobBroker, StatefulJobBroker, StateUpdater +import mediawords.solr +import mediawords.solr.query +import mediawords.util.sql +import topics_base.alert +import topics_base.stories +import topics_mine.fetch_topic_posts + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +# lock_type to send to get_session_lock +LOCK_TYPE = 'MediaWords::Job::TM::MineTopic' + +# total time to wait for fetching of social media metrics +MAX_SOCIAL_MEDIA_FETCH_TIME = (60 * 60 * 24) + +# add new links in chunks of this size +ADD_NEW_LINKS_CHUNK_SIZE = 10000 + +# extract story links in chunks of this size +EXTRACT_STORY_LINKS_CHUNK_SIZE = 1000 + +# query this many topic_links at a time to spider +SPIDER_LINKS_CHUNK_SIZE = 100000 + +# raise McTopicMineError if the error rate for link fetch or link extract jobs is greater than this +MAX_JOB_ERROR_RATE = 0.02 + +# timeout when polling for jobs to finish +JOB_POLL_TIMEOUT = 600 + +# number of seconds to wait when polling for jobs to finish +JOB_POLL_WAIT = 5 + +# if more than this many seed urls are imported, dedup stories before as well as after spidering +MIN_SEED_IMPORT_FOR_PREDUP_STORIES = 50000 + +# how many link extraction jobs per 1000 can we ignore if they hang +MAX_LINK_EXTRACTION_TIMEOUT = 10 + +# how long to wait to timeout link extraction +LINK_EXTRACTION_POLL_TIMEOUT = 600 + +# domain timeout for link fetching +DOMAIN_TIMEOUT = None + +class McTopicMineError(Exception): + pass + + +def update_topic_state(db: DatabaseHandler, state_updater: Optional[StateUpdater], message: str) -> None: + """ update topics.state in the database""" + + log.info("update topic state: message") + + if not state_updater: + # Shouldn't happen but let's just test it here + log.warning("State updater is unset.") + return + + state_updater.update_job_state_message(db, message) + + +def story_within_topic_date_range(topic: dict, story:dict) -> bool: + """return True if the publish date of the story is within 7 days of the topic date range or if it is undateable""" + + if not story['publish_date']: + return True + + story_date = (story['publish_date'])[0:10] + + start_date = topic['start_date'] + start_date = mediawords.util.sql.increment_day(start_date, -7) + start_date = start_date[0:10] + + end_date = topic['end_date'] + end_date = mediawords.util.sql.increment_day(end_date, 7) + end_date = end_date[0:10] + + return story_date >= start_date and story_date <= end_date + + +def generate_topic_links(db: DatabaseHandler, topic: dict, stories: list): + """submit jobs to extract links from the stories and then poll to wait for the stories to be processed""" + log.info(f"generate topic links: {len(stories)}") + + if len(stories) < 1: + return + + topic_links = [] + + if topic['platform'] != 'web': + log.info("skip link generation for non web topic") + return + + stories_ids_table = db.get_temporary_ids_table([s['stories_id'] for s in stories]) + + db.query( + f""" + update topic_stories set link_mined = 'f' + where + stories_id in (select id from {stories_ids_table}) and + topics_id = %(a)s and + link_mined = 't' + """, + {'a': topic['topics_id']}) + + queued_stories_ids = [] + for story in stories: + if not 
story_within_topic_date_range(topic, story):
+            continue
+
+        queued_stories_ids.append(story['stories_id'])
+
+        JobBroker(queue_name='MediaWords::Job::TM::ExtractStoryLinks').add_to_queue(
+            stories_id=story['stories_id'],
+            topics_id=topic['topics_id'])
+
+        log.debug(f"queued link extraction for story {story['title']} {story['url']}.")
+
+    log.info(f"waiting for {len(queued_stories_ids)} link extraction jobs to finish")
+
+    queued_ids_table = db.get_temporary_ids_table(queued_stories_ids)
+
+    # poll every JOB_POLL_WAIT seconds waiting for the jobs to complete.  raise McTopicMineError if the number
+    # of stories left to process has not shrunk for LINK_EXTRACTION_POLL_TIMEOUT seconds.
+    prev_num_queued_stories = len(stories)
+    last_change_time = time()
+    while True:
+        queued_stories = db.query(
+            f"""
+            select stories_id from topic_stories
+                where stories_id in (select id from {queued_ids_table}) and topics_id = %(a)s and link_mined = 'f'
+            """,
+            {'a': topic['topics_id']}).flat()
+
+        num_queued_stories = len(queued_stories)
+
+        if not num_queued_stories:
+            break
+
+        if num_queued_stories != prev_num_queued_stories:
+            last_change_time = time()
+
+        if (time() - last_change_time) > LINK_EXTRACTION_POLL_TIMEOUT:
+            # stories_id values come back as ints, so cast them before joining for the error message
+            ids_list = ','.join([str(s) for s in queued_stories])
+            if num_queued_stories > MAX_LINK_EXTRACTION_TIMEOUT:
+                raise McTopicMineError(f"Timed out waiting for story link extraction ({ids_list}).")
+
+            db.query(
+                """
+                update topic_stories set link_mine_error = 'time out'
+                    where stories_id = any(%(b)s) and topics_id = %(a)s
+                """,
+                {'a': topic['topics_id'], 'b': queued_stories})
+
+            break
+
+        log.info(f"{num_queued_stories} stories left in link extraction pool....")
+
+        prev_num_queued_stories = num_queued_stories
+        sleep(JOB_POLL_WAIT)
+
+    db.query(
+        f"""
+        update topic_stories set link_mined = 't'
+            where stories_id in (select id from {stories_ids_table}) and topics_id = %(a)s and link_mined = 'f'
+        """,
+        {'a': topic['topics_id']})
+
+    db.query(f"drop table {stories_ids_table}")
+
+
+def die_if_max_stories_exceeded(db: DatabaseHandler, topic: dict) -> None:
+    """
+    raise a McTopicMineError if the number of topic_stories exceeds topics.max_stories.
+    """
+    num_topic_stories = db.query(
+        "select count(*) from topic_stories where topics_id = %(a)s",
+        {'a': topic['topics_id']}).flat()[0]
+
+    if num_topic_stories > topic['max_stories']:
+        raise McTopicMineError(f"{num_topic_stories} stories > {topic['max_stories']}")
+
+
+def queue_topic_fetch_url(tfu: dict, domain_timeout: Optional[int] = None):
+    """ add the topic_fetch_url to the fetch_link job queue."""
+
+    # honor an explicit domain_timeout (the requeue logic passes 0 to bypass per-domain throttling);
+    # otherwise fall back to the module default
+    if domain_timeout is None:
+        domain_timeout = DOMAIN_TIMEOUT
+
+    JobBroker(queue_name='MediaWords::Job::TM::FetchLink').add_to_queue(
+        topic_fetch_urls_id=tfu['topic_fetch_urls_id'],
+        domain_timeout=domain_timeout)
+
+
+def create_and_queue_topic_fetch_urls(db: DatabaseHandler, topic: dict, fetch_links: list) -> list:
+    """
+    create topic_fetch_urls rows corresponding to the links and queue a FetchLink job for each.
+
+    return the tfu rows.
+    """
+    tfus = []
+    for link in fetch_links:
+        topic_links_id = link.get('topic_links_id', None)
+        assume_match = link.get('assume_match', False)
+
+        # if this link has an associated topic_links row but that row has been deleted, ignore it.
+        # this can be used to delete spam urls from topic_links during the spidering process.
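+        # a deleted row just means this url is skipped here; the rest of the chunk keeps spidering.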
+ if topic_links_id and not db.find_by_id('topic_links', topic_links_id): + continue + + tfu = { + 'topics_id': topic['topics_id'], + 'url': link['url'], + 'state': 'pending', + 'assume_match': assume_match, + 'topic_links_id': topic_links_id} + tfu = db.create('topic_fetch_urls', tfu) + + tfus.append(tfu) + + queue_topic_fetch_url(tfu) + + return tfus + + +def _fetch_twitter_urls(db: DatabaseHandler, topic: dict, tfu_ids: list) -> None: + """ + Send topic_fetch_urls to fetch_twitter_urls queue and wait for the jobs to complete. + """ + # we run into quota limitations sometimes and need a longer timeout + twitter_poll_timeout = JOB_POLL_TIMEOUT * 5 + + twitter_tfu_ids = db.query( + """ + select topic_fetch_urls_id + from topic_fetch_urls tfu + where + tfu.state = 'tweet pending' and + tfu.topic_fetch_urls_id = any(%(a)s) + """, {'a': tfu_ids}).flat() + + if not twitter_tfu_ids: + return + + tfu_ids_table = db.get_temporary_ids_table(twitter_tfu_ids) + + JobBroker(queue_name='MediaWords::Job::TM::FetchTwitterUrls').add_to_queue( + topic_fetch_urls_ids=twitter_tfu_ids) + + log.info(f"waiting for fetch twitter urls job for {len(twitter_tfu_ids)} urls") + + # poll every sleep_time seconds waiting for the jobs to complete. + # raise McTopicMineError if the number of stories left to process + # has not shrunk for large_timeout seconds. warn but continue if the number of stories left to process + # is only 5% of the total and short_timeout has passed (this is to make the topic not hang entirely because + # of one link extractor job error). + prev_num_queued_urls = len(twitter_tfu_ids) + last_change_time = time() + while True: + queued_tfus = db.query( + f""" + select tfu.* + from topic_fetch_urls tfu + join {tfu_ids_table} ids on (tfu.topic_fetch_urls_id = ids.id) + where + state in ('tweet pending') + """).hashes() + + num_queued_urls = len(queued_tfus) + + if num_queued_urls == 0: + break + + if num_queued_urls != prev_num_queued_urls: + last_change_time = time() + + if (time() - last_change_time) > twitter_poll_timeout: + raise McTopicMineError(f"Timed out waiting for twitter fetching {queued_tfus}") + + log.info(f"{num_queued_urls} twitter urls left to fetch ...") + + prev_num_queued_urls = num_queued_urls + sleep(JOB_POLL_WAIT) + + +def list_pending_urls(pending_urls: list) -> str: + """list a sample of the pending urls for fetching""" + num_pending_urls = len(pending_urls) + + num_printed_urls = min(num_pending_urls, 3) + + random.shuffle(pending_urls) + urls = pending_urls[0:num_printed_urls] + + return "\n".join([f"pending url: {url['url']} [{url['state']}: {url['fetch_date']}]" for url in urls]) + + +def fetch_links(db: DatabaseHandler, topic: dict, fetch_links: dict) -> None: + """ + fetch the given links by creating topic_fetch_urls rows and sending them to the FetchLink queue + for processing. wait for the queue to complete and return the resulting topic_fetch_urls. 
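+
+    each item in fetch_links should be a dict with at least a 'url' key and, optionally,
+    'topic_links_id' and 'assume_match' keys; the wait loop polls until no queued row is left in
+    the 'pending' or 'requeued' state.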
+ """ + + log.info("fetch_links: queue links") + tfus = create_and_queue_topic_fetch_urls(db, topic, fetch_links) + num_queued_links = len(fetch_links) + + log.info(f"waiting for fetch link queue: {num_queued_links} queued") + + tfu_ids = [tfu['topic_fetch_urls_id'] for tfu in tfus] + + requeues = 0 + max_requeues = 1 + max_requeue_jobs = 100 + requeue_timeout = 30 + instant_requeued = 0 + + # once the pool is this small, just requeue everything with a 0 per site throttle + instant_queue_size = 25 + + # how many times to requeues everything if there is no change for JOB_POLL_TIMEOUT seconds + full_requeues = 0 + max_full_requeues = 1 + + last_pending_change = time() + last_num_pending_urls = 0 + while True: + pending_urls = db.query( + """ + select *, coalesce(fetch_date::text, 'null') fetch_date + from topic_fetch_urls + where + topic_fetch_urls_id = any(%(a)s) and + state in ('pending', 'requeued') + """, + {'a': tfu_ids}).hashes() + + pending_url_ids = [u['topic_fetch_urls_id'] for u in pending_urls] + + num_pending_urls = len(pending_url_ids) + + log.info(f"waiting for fetch link queue: {num_pending_urls} links remaining ...") + log.info(list_pending_urls(pending_urls)) + + if num_pending_urls < 1: + break + + # if we only have a handful of job left, requeue them all once with a 0 domain throttle + if not instant_requeued and num_pending_urls <= instant_queue_size: + instant_requeued = 1 + [queue_topic_fetch_url(db.require_by_id('topic_fetch_urls', id), 0) for id in pending_url_ids] + sleep(JOB_POLL_WAIT) + continue + + time_since_change = time() - last_pending_change + + # for some reason, the fetch_link queue is occasionally losing a small number of jobs. + if (time_since_change > requeue_timeout and + requeues < max_requeues and + num_pending_urls < max_requeue_jobs): + log.info(f"requeueing fetch_link {num_pending_urls} jobs ... 
[{requeues} requeues]")
+
+            # requeue with a domain_timeout of 0 so that requeued urls can ignore throttling
+            [queue_topic_fetch_url(db.require_by_id('topic_fetch_urls', id), 0) for id in pending_url_ids]
+            requeues += 1
+            last_pending_change = time()
+
+        if time_since_change > JOB_POLL_TIMEOUT:
+            if num_pending_urls > max_requeue_jobs:
+                raise McTopicMineError("Timed out waiting for fetch link queue")
+            elif full_requeues < max_full_requeues:
+                [queue_topic_fetch_url(db.require_by_id('topic_fetch_urls', id)) for id in pending_url_ids]
+                full_requeues += 1
+                last_pending_change = time()
+            else:
+                for id in pending_url_ids:
+                    db.update_by_id('topic_fetch_urls', id, {'state': 'python error', 'message': 'timed out'})
+
+                log.info(f"timed out {len(pending_url_ids)} urls")
+
+        if num_pending_urls < last_num_pending_urls:
+            last_pending_change = time()
+
+        last_num_pending_urls = num_pending_urls
+
+        sleep(JOB_POLL_WAIT)
+
+    _fetch_twitter_urls(db, topic, tfu_ids)
+
+    log.info("fetch_links: update topic seed urls")
+    db.query(
+        """
+        update topic_seed_urls tsu
+            set stories_id = tfu.stories_id, processed = 't'
+        from topic_fetch_urls tfu
+        where
+            tfu.url = tsu.url and
+            tfu.stories_id is not null and
+            tfu.topic_fetch_urls_id = any(%(a)s) and
+            tfu.topics_id = tsu.topics_id
+        """,
+        {'a': tfu_ids})
+
+    completed_tfus = db.query(
+        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
+        {'a': tfu_ids}).hashes()
+
+    log.info("completed fetch link queue")
+
+    return completed_tfus
+
+
+def add_new_links_chunk(db: DatabaseHandler, topic: dict, iteration: int, new_links: list) -> None:
+    """
+    download any unmatched link in new_links, add it as a story, extract it, add any links to the topic_links list.
+
+    each hash within new_links can either be a topic_links hash or simply a hash with a {url} field.  if
+    the link is a topic_links hash, the topic_link will be updated in the database to point ref_stories_id
+    to the new link story.  For each link, set the {story} field to the story found or created for the link.
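+
+    as a side effect, any link that carries a topic_links_id is marked link_spidered = 't' in
+    topic_links once its chunk has been fetched.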
+ """ + die_if_max_stories_exceeded(db, topic) + + log.info("add_new_links_chunk: fetch_links") + topic_fetch_urls = fetch_links(db, topic, new_links) + + log.info("add_new_links_chunk: mark topic links spidered") + link_ids = [l['topic_links_id'] for l in new_links if 'topic_links_id' in l] + + db.query( + "update topic_links set link_spidered = 't' where topic_links_id = any(%(a)s)", + {'a': link_ids}) + + +def save_metrics(db: DatabaseHandler, topic: dict, iteration: int, num_links: int, elapsed_time: int) -> None: + """save a row in the topic_spider_metrics table to track performance of spider""" + + topic_spider_metric = { + 'topics_id': topic['topics_id'], + 'iteration': iteration, + 'links_processed': num_links, + 'elapsed_time': elapsed_time + } + + db.create('topic_spider_metrics', topic_spider_metric) + + +def add_new_links(db:DatabaseHandler, topic:dict, iteration:int, new_links:list, state_updater:Callable) -> None: + """call add_new_links in chunks of ADD_NEW_LINKS_CHUNK_SIZE""" + log.info("add new links") + + if not new_links: + return + + spider_progress = get_spider_progress_description(db, topic, iteration, len(new_links)) + + num_links = len(new_links) + + i = 0 + while i < num_links: + start_time = time() + + update_topic_state(db, state_updater, f"spider_progress iteration links: {i} / {num_links}") + + chunk_links = new_links[i:i + ADD_NEW_LINKS_CHUNK_SIZE] + add_new_links_chunk(db, topic, iteration, chunk_links) + + elapsed_time = time() - start_time + save_metrics(db, topic, iteration, len(chunk_links), elapsed_time) + + i += ADD_NEW_LINKS_CHUNK_SIZE + + mine_topic_stories(db, topic) + + +def get_new_links(db: DatabaseHandler, iteration: int, topics_id: int) -> list: + """query the database for new links from stories below the given iteration.""" + + new_links = db.query( + """ + select tl.* + from + topic_links tl + join topic_stories ts using ( topics_id ) + where + tl.link_spidered = 'f' and + tl.stories_id = ts.stories_id and + (ts.iteration <= %(a)s or ts.iteration = 1000) and + ts.topics_id = %(b)s + + limit %(c)s + """, + {'a': iteration, 'b': topics_id, 'c': SPIDER_LINKS_CHUNK_SIZE}).hashes() + + return new_links + + +def spider_new_links( + db: DatabaseHandler, topic: dict, iteration: int, state_updater: Optional[StateUpdater]) -> None: + """call add_new_links on topic_links for which link_spidered is false.""" + + while True: + log.info("querying new links ...") + + db.query("drop table if exists _new_links") + + num_new_links = db.query( + """ + create temporary table _new_links as + select tl.* + from topic_links tl, topic_stories ts + where + tl.link_spidered = 'f' and + tl.stories_id = ts.stories_id and + (ts.iteration <= %(a)s or ts.iteration = 1000) and + ts.topics_id = %(b)s and + tl.topics_id = %(b)s + order by random() + """, + {'a': iteration, 'b': topic['topics_id']}).rows() + + db.query("create index _new_links_tl on _new_links (topic_links_id)") + + if num_new_links < 1: + break + + log.info(f"found {num_new_links} new links") + + while True: + new_links = db.query("select * from _new_links limit %(a)s", {'a': SPIDER_LINKS_CHUNK_SIZE}).hashes() + if not new_links: + break + + tl_ids = [n['topic_links_id'] for n in new_links] + db.query("delete from _new_links where topic_links_id = any(%(a)s)", {'a': tl_ids}) + add_new_links(db, topic, iteration, new_links, state_updater) + +def get_spider_progress_description(db: DatabaseHandler, topic: dict, iteration: int, total_links: int) -> str: + """get short text description of spidering progress""" + + 
log.info("get spider progress description") + + topics_id = topic['topics_id'] + + total_stories = db.query( + "select count(*) from topic_stories where topics_id = %(a)s", + {'a': topics_id}).flat()[0] + + stories_last_iteration = db.query( + "select count(*) from topic_stories where topics_id = %(a)s and iteration = %(b)s - 1", + {'a': topics_id, 'b': iteration}).flat()[0] + + queued_links = db.query( + "select count(*) from topic_links where topics_id = %(a)s and not link_spidered", + {'a': topics_id}).flat()[0] + + return ( + f"spidering iteration: {iteration} stories last iteration / total: " + f"{stories_last_iteration} / {total_stories} links queued: {queued_links} iteration links: {total_links}" + ) + + +def run_spider(db: DatabaseHandler, topic: dict, state_updater: Optional[StateUpdater]) -> None: + """run the spider over any new links, for num_iterations iterations""" + log.info("run spider") + + # before we run the spider over links, we need to make sure links have been generated for all existing stories + mine_topic_stories(db, topic) + + iterations = topic['max_iterations'] + [spider_new_links(db, topic, iterations, state_updater) for i in range(iterations)] + + +def mine_topic_stories(db: DatabaseHandler, topic: dict) -> None: + """ mine for links any stories in topic_stories for this topic that have not already been mined""" + log.info("mine topic stories") + + # skip for non-web topic, because the below query grows very large without ever mining links + if topic['platform'] != 'web': + log.info("skip link generation for non-web topic") + return + + # chunk the story extractions so that one big topic does not take over the entire queue + i = 0 + while True: + i += EXTRACT_STORY_LINKS_CHUNK_SIZE + log.info("mine topic stories: chunked i ...") + stories = db.query( + """ + select s.*, ts.link_mined, ts.redirect_url + from snap.live_stories s + join topic_stories ts on (s.stories_id = ts.stories_id and s.topics_id = ts.topics_id) + where + ts.link_mined = false and + ts.topics_id = %(a)s + limit %(b)s + """, {'a': topic['topics_id'], 'b': EXTRACT_STORY_LINKS_CHUNK_SIZE}).hashes() + + num_stories = len(stories) + + generate_topic_links(db, topic, stories) + + if num_stories < EXTRACT_STORY_LINKS_CHUNK_SIZE: + break + + +def import_seed_urls(db: DatabaseHandler, topic: dict, state_updater: Optional[StateUpdater]) -> int: + """ import all topic_seed_urls that have not already been processed + + return number of seed urls imported + """ + log.info("import seed urls") + + topics_id = topic['topics_id'] + + # take care of any seed urls with urls that we have already processed for this topic + db.query( + """ + update topic_seed_urls a set stories_id = b.stories_id, processed = 't' + from topic_seed_urls b + where a.url = b.url and + a.topics_id = %(a)s and b.topics_id = a.topics_id and + a.stories_id is null and b.stories_id is not null + """, + {'a': topics_id}) + + # randomly shuffle this query so that we don't block the extractor pool by throwing it all + # stories from a single media_id at once + seed_urls = db.query( + "select * from topic_seed_urls where topics_id = %(a)s and processed = 'f' order by random()", + {'a': topics_id}).hashes() + + if not seed_urls: + return 0 + + # process these in chunks in case we have to start over so that we don't have to redo the whole batch + num_urls = len(seed_urls) + i = 0 + while i < num_urls: + start_time = time() + + update_topic_state(db, state_updater, f"importing seed urls: {i} / {num_urls}") + + chunk_urls = seed_urls[i:i + 
ADD_NEW_LINKS_CHUNK_SIZE] + + # verify that the seed urls exist and not processed, in case we have mucked with them while spidering + url_ids = [u['topic_seed_urls_id'] for u in chunk_urls] + seed_urls_chunk = db.query( + "select * from topic_seed_urls where topic_seed_urls_id = any(%(a)s) and not processed", + {'a': url_ids}).hashes() + + add_new_links_chunk(db, topic, 0, seed_urls_chunk) + + url_ids = [u['topic_seed_urls_id'] for u in seed_urls_chunk] + + # update topic_seed_urls that were actually fetched + db.query( + """ + update topic_seed_urls tsu + set stories_id = tfu.stories_id + from topic_fetch_urls tfu + where + tsu.topics_id = tfu.topics_id and + md5(tsu.url) = md5(tfu.url) and + tsu.topic_seed_urls_id = any(%(a)s) + """, + {'a': url_ids}) + + # now update the topic_seed_urls that were matched + db.query( + """ + update topic_seed_urls tsu + set processed = 't' + where + tsu.topic_seed_urls_id = any(%(a)s) and + processed = 'f' + """, + {'a': url_ids}) + + elapsed_time = time() - start_time + save_metrics(db, topic, 1, len(chunk_urls), elapsed_time) + + i += ADD_NEW_LINKS_CHUNK_SIZE + + # cleanup any topic_seed_urls pointing to a merged story + db.query( + """ + UPDATE topic_seed_urls AS tsu + SET stories_id = tms.target_stories_id, processed = 't' + FROM topic_merged_stories_map AS tms, + topic_stories ts + WHERE tsu.stories_id = tms.source_stories_id + AND ts.stories_id = tms.target_stories_id + AND tsu.topics_id = ts.topics_id + AND ts.topics_id = %(a)s + """, + {'a': topic['topics_id']}) + + return len(seed_urls) + + +def insert_topic_seed_urls(db: DatabaseHandler, topic_seed_urls: list) -> None: + """ insert a list of topic seed urls""" + log.info(f"inserting {len(topic_seed_urls)} topic seed urls ...") + + for tsu in topic_seed_urls: + insert_tsu = {f: tsu[f] for f in ('stories_id', 'url', 'topics_id', 'assume_match')} + db.create('topic_seed_urls', insert_tsu) + + +def _import_month_within_respider_date(topic: dict, month_offset: int) -> bool: + """ return True if the given month offset is within the dates that should be respidered. + + always return True if there are no respider dates + """ + + start_date = topic['respider_start_date'] or '' + end_date = topic['respider_end_date'] or '' + + if not (topic['respider_stories'] and (start_date or end_date)): + return True + + month_date = datetime.datetime.strptime(topic['start_date'], '%Y-%m-%d') + relativedelta(months=month_offset) + log.warning(month_date) + + if end_date: + end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d') + relativedelta(months=-1) + log.warning(f"end_date: {end_date}") + if month_date > end_date: + return True + + if start_date: + start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d') + log.warning(f"start_date: {start_date}") + if month_date < start_date: + return True + + return False + + +def _search_for_stories_urls(db: DatabaseHandler, params: dict) -> list: + """Call search_solr_for_stories_ids() and then query postgres for the stories urls. + + Return dicts with stories_id and url fields.""" + + stories_ids = mediawords.solr.search_solr_for_stories_ids(db, params) + + stories = db.query("select stories_id,url from stories where stories_id = any(%(a)s)", {'a': stories_ids}).hashes() + + return stories + + +def import_solr_seed_query_month(db: DatabaseHandler, topic: dict, month_offset: int) -> bool: + """ import a single month of the solr seed query. we do this to avoid giant queries that timeout in solr. 
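+    the month_offset is counted in months forward from the topic start_date.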
+ + return True if the month_offset is valid for the topic.""" + if not topic['platform'] == 'web': + return False + + solr_query = mediawords.solr.query.get_full_solr_query_for_topic(db=db, topic=topic, month_offset=month_offset) + + # this should return undef once the month_offset gets too big + if not solr_query: + return False + + if not _import_month_within_respider_date(topic, month_offset): + return True + + max_stories = topic['max_stories'] + + # if solr maxes out on returned stories, it returns a few documents less than the rows= parameter, so we + # assume that we hit the solr max if we are within 5% of the max stories + max_returned_stories = max_stories * 0.95 + + log.info(f"import solr seed query month offset {month_offset}") + solr_query['rows'] = max_stories + + stories = _search_for_stories_urls(db, solr_query) + + if len(stories) > max_returned_stories: + raise McTopicMineError(f"solr_seed_query returned more than {max_returned_stories} stories") + + log.info(f"adding {len(stories)} stories to topic_seed_urls") + + topic_seed_urls = [] + for story in stories: + tsu = { + 'topics_id': topic['topics_id'], + 'url': story['url'], + 'stories_id': story['stories_id'], + 'assume_match': 'f'} + topic_seed_urls.append(tsu) + + insert_topic_seed_urls(db, topic_seed_urls) + + return True + + +def import_solr_seed_query(db: DatabaseHandler, topic: dict) -> None: + """ import stories into topic_seed_urls from solr by running topic['solr_seed_query'] against solr. + + if the solr query has already been imported, do nothing.""" + + log.info("import solr seed query") + + if topic['solr_seed_query_run']: + return + + month_offset = 0 + while import_solr_seed_query_month(db, topic, month_offset): + month_offset += 1 + pass + + db.query("update topics set solr_seed_query_run = 't' where topics_id = %(a)s", {'a': topic['topics_id']}) + + +def all_facebook_data_fetched(db: DatabaseHandler, topic: dict) -> bool: + """ return True if there are no stories without facebook data""" + + null_facebook_story = db.query( + """ + select 1 + from topic_stories cs + left join story_statistics ss on (cs.stories_id = ss.stories_id) + where + cs.topics_id = %(a)s and + ss.facebook_api_error is null and + ( + ss.stories_id is null or + ss.facebook_share_count is null or + ss.facebook_comment_count is null or + ss.facebook_api_collect_date is null + ) + limit 1 + """, + {'a': topic['topics_id']}).hash() + + return null_facebook_story is None + + +def _add_topic_stories_to_facebook_queue(db: DatabaseHandler, topic: dict) -> None: + """ add all topic stories without facebook data to the queue""" + topics_id = topic['topics_id'] + + stories = db.query( + """ + SELECT ss.*, cs.stories_id + FROM topic_stories cs + left join story_statistics ss on (cs.stories_id = ss.stories_id) + WHERE cs.topics_id = %(a)s + ORDER BY cs.stories_id + """, + {'a': topics_id}).hashes() + + if not stories: + log.debug("No stories found for topic 'topic['name']'") + + for ss in stories: + if (ss['facebook_api_error'] or + ss['facebook_api_collect_date'] is None or + ss['facebook_share_count'] is None or + ss['facebook_comment_count'] is None): + log.debug(f"Adding job for story {ss['stories_id']}") + args = {'stories_id': ss['stories_id']} + + JobBroker(queue_name='MediaWords::Job::Facebook::FetchStoryStats').add_to_queue( + stories_id=ss['stories_id']) + + +def fetch_social_media_data(db: DatabaseHandler, topic: dict) -> None: + """ send jobs to fetch facebook data for all stories that don't yet have it""" + + log.info("fetch social 
media data") + + cid = topic['topics_id'] + + _add_topic_stories_to_facebook_queue(db, topic) + + poll_wait = 30 + retries = int(MAX_SOCIAL_MEDIA_FETCH_TIME / poll_wait) + 1 + + for i in range(retries): + if all_facebook_data_fetched(db, topic): + return + sleep(poll_wait) + + raise McTopicMineError("Timed out waiting for social media data") + + +def check_job_error_rate(db: DatabaseHandler, topic: dict) -> None: + """ raise an error if error rate for link extraction or link fetching is too high""" + + log.info("check job error rate") + + fetch_stats = db.query( + """ + select count(*) num, (state = 'python error') as error + from topic_fetch_urls + where topics_id = %(a)s + group by (state = 'python error') + """, + {'a': topic['topics_id']}).hashes() + + num_fetch_errors = sum([s['num'] for s in fetch_stats if s['error']]) + num_fetch_successes = sum([s['num'] for s in fetch_stats if not s['error']]) + + fetch_error_rate = num_fetch_errors / (num_fetch_errors + num_fetch_successes + 1) + + log.info(f"Fetch error rate: {fetch_error_rate} ({num_fetch_errors} / {num_fetch_successes})") + + if fetch_error_rate > MAX_JOB_ERROR_RATE: + raise McTopicMineError(f"Fetch error rate of {fetch_error_rate} is greater than {MAX_JOB_ERROR_RATE}") + + link_stats = db.query( + """ + select count(*) num, (length( link_mine_error) > 0) as error + from topic_stories + where topics_id = %(a)s + group by (length(link_mine_error) > 0) + """, + {'a': topic['topics_id']}).hashes() + + num_link_errors = sum([s['num'] for s in link_stats if s['error']]) + num_link_successes = sum([s['num'] for s in link_stats if not s['error']]) + + link_error_rate = num_link_errors / (num_link_errors + num_link_successes + 1) + + log.info(f"Link error rate: {link_error_rate} ({num_link_errors} / {num_link_successes})") + + if link_error_rate > MAX_JOB_ERROR_RATE: + raise McTopicMineError(f"link error rate of {link_error_rate} is greater than {MAX_JOB_ERROR_RATE}") + + +def import_urls_from_seed_queries(db: DatabaseHandler, topic: dict, state_updater: Optional[StateUpdater]) -> None: + """ import urls from seed query """ + + topic_seed_queries = db.query( + "select * from topic_seed_queries where topics_id = %(a)s", + {'a': topic['topics_id']}).hashes() + + log.debug("import seed urls from solr") + update_topic_state(db, state_updater, "importing solr seed query") + import_solr_seed_query(db, topic) + + for tsq in topic_seed_queries: + tsq_dump = tsq['topic_seed_queries_id'] + fetcher = topics_mine.fetch_topic_posts.get_post_fetcher(tsq) + if not fetcher: + raise McTopicMineError(f"unable to import seed urls for platform/source of seed query: {tsq_dump}") + + log.debug(f"import seed urls from fetch_topic_posts:\n{tsq_dump}") + topics_mine.fetch_topic_posts.fetch_topic_posts(db, tsq) + + db.query( + """ + insert into topic_seed_urls + (url, topics_id, assume_match, source, topic_seed_queries_id, topic_post_urls_id) + select distinct + tpu.url, + tsq.topics_id, + false, + 'topic_seed_queries', + tsq.topic_seed_queries_id, + tpu.topic_post_urls_id + from + topic_post_urls tpu + join topic_posts tp using (topic_posts_id) + join topic_post_days tpd using (topic_post_days_id) + join topic_seed_queries tsq using (topic_seed_queries_id) + where + tsq.topics_id = %(a)s + on conflict (topic_post_urls_id) do nothing + """, + {'a': topic['topics_id']}) + + +def set_stories_respidering(db: DatabaseHandler, topic: dict, snapshots_id: int) -> None: + """ if the query or dates have changed, set topic_stories.link_mined to false so they will be 
respidered""" + + if not topic['respider_stories']: + return + + respider_start_date = topic['respider_start_date'] + respider_end_date = topic['respider_end_date'] + + if not respider_start_date and not respider_end_date: + db.query("update topic_stories set link_mined = 'f' where topics_id = %(a)s", {'a': topic['topics_id']}) + return + + if respider_start_date: + db.query( + """ + update topic_stories ts set link_mined = 'f' + from stories s + where + ts.stories_id = s.stories_id and + s.publish_date >= %(b)s and + s.publish_date <= %(a)s and + ts.topics_id = %(c)s + """, + {'a': respider_start_date, 'b': topic['start_date'], 'c': topic['topics_id']}) + + if snapshots_id: + db.update_by_id('snapshots', snapshots_id, {'start_date': topic['start_date']}) + db.query( + """ + update timespans set archive_snapshots_id = snapshots_id, snapshots_id = null + where snapshots_id = %(a)s and start_date < %(b)s + """, + {'a': snapshots_id, 'b': respider_start_date}) + + if respider_end_date: + db.query( + """ + update topic_stories ts set link_mined = 'f' + from stories s + where + ts.stories_id = s.stories_id and + s.publish_date >= %(a)s and + s.publish_date <= %(b)s and + ts.topics_id = %(c)s + """, + {'a': respider_end_date, 'b': topic['end_date'], 'c': topic['topics_id']}) + + if snapshots_id: + db.update_by_id('snapshots', snapshots_id, {'end_date': topic['end_date']}) + db.query( + """ + update timespans set archive_snapshots_id = snapshots_id, snapshots_id = null + where snapshots_id = %(a)s and end_date > %(b)s + """, + {'a': snapshots_id, 'b': respider_end_date}) + + db.update_by_id( + 'topics', + topic['topics_id'], + {'respider_stories': 'f', 'respider_start_date': None, 'respider_end_date': None}) + + +def do_mine_topic(db: DatabaseHandler, topic: dict, options: dict) -> None: + """ mine the given topic for links and to recursively discover new stories on the web. + + options: + import_only - only run import_seed_urls and import_solr_seed and exit + skip_post_processing - skip social media fetching and snapshotting + snapshots_id - associate topic with the given existing snapshot + state_updater - object that implements mediawords.job.StateUpdater + """ + [options.setdefault(f, None) for f in 'state_updater import_only skip_post_processing snapshots_id'.split()] + + state_updater = options['state_updater'] + + update_topic_state(db, state_updater, "importing seed urls") + import_urls_from_seed_queries(db, topic, state_updater) + + update_topic_state(db, state_updater, "setting stories respidering...") + set_stories_respidering(db, topic, options['snapshots_id']) + + # this may put entires into topic_seed_urls, so run it before import_seed_urls. + # something is breaking trying to call this perl. 
commenting out for time being since we only need
+    # this when we very rarely change the foreign_rss_links field of a media source - hal
+    # update_topic_state(db, state_updater, "merging foreign rss stories")
+    # topics_base.stories.merge_foreign_rss_stories(db, topic)
+
+    update_topic_state(db, state_updater, "importing seed urls")
+    if import_seed_urls(db, topic, state_updater) > MIN_SEED_IMPORT_FOR_PREDUP_STORIES:
+        # merge dup stories before as well as after spidering to avoid extra spidering work
+        update_topic_state(db, state_updater, "merging duplicate stories")
+        topics_base.stories.find_and_merge_dup_stories(db, topic)
+
+    if not options.get('import_only', False):
+        update_topic_state(db, state_updater, "running spider")
+        run_spider(db, topic, state_updater)
+
+        check_job_error_rate(db, topic)
+
+        # merge dup media and stories again to catch dups from spidering
+        update_topic_state(db, state_updater, "merging duplicate stories")
+        topics_base.stories.find_and_merge_dup_stories(db, topic)
+
+        update_topic_state(db, state_updater, "merging duplicate media stories")
+        topics_base.stories.merge_dup_media_stories(db, topic)
+
+        if not options.get('skip_post_processing', False):
+            update_topic_state(db, state_updater, "fetching social media data")
+            fetch_social_media_data(db, topic)
+
+            update_topic_state(db, state_updater, "snapshotting")
+            snapshot_args = {'topics_id': topic['topics_id'], 'snapshots_id': options['snapshots_id']}
+            StatefulJobBroker(queue_name='MediaWords::Job::TM::SnapshotTopic').add_to_queue(**snapshot_args)
+
+
+def mine_topic(db: DatabaseHandler, topic: dict, **options: dict) -> None:
+    """ wrap do_mine_topic in try and handle errors and state"""
+
+    # the topic spider can sit around for long periods doing solr queries, so we need to make sure the postgres
+    # connection does not get timed out
+    db.query("set idle_in_transaction_session_timeout = 0")
+
+    if topic['state'] != 'running':
+        topics_base.alert.send_topic_alert(db, topic, "started topic spidering")
+
+    get_session_lock(db=db, lock_type=LOCK_TYPE, lock_id=topic['topics_id'])
+
+    try:
+        do_mine_topic(db, topic, options)
+    except Exception as e:
+        topics_base.alert.send_topic_alert(db, topic, "aborted topic spidering due to error")
+        raise e
+
+    release_session_lock(db=db, lock_type=LOCK_TYPE, lock_id=topic['topics_id'])
+
+
+def run_worker_job(topics_id: int, snapshots_id: Optional[int] = None) -> None:
+    """run a topics-mine worker job."""
+    # these helpers are only needed by this worker entry point, so import them locally
+    from mediawords.db import connect_to_db
+    from mediawords.util.perl import decode_object_from_bytes_if_needed
+
+    if isinstance(snapshots_id, bytes):
+        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)
+    if snapshots_id is not None:
+        snapshots_id = int(snapshots_id)
+
+    if isinstance(topics_id, bytes):
+        topics_id = decode_object_from_bytes_if_needed(topics_id)
+    if topics_id is not None:
+        topics_id = int(topics_id)
+
+    if not bool(topics_id):
+        raise McTopicMineError("topics_id must be set")
+
+    db = connect_to_db()
+
+    topic = db.require_by_id('topics', topics_id)
+
+    mine_topic(db=db, topic=topic, snapshots_id=snapshots_id)
diff --git a/apps/topics-mine/src/python/topics_mine/test.py b/apps/topics-mine/src/python/topics_mine/test.py
new file mode 100644
index 0000000000..efb2d6b156
--- /dev/null
+++ b/apps/topics-mine/src/python/topics_mine/test.py
@@ -0,0 +1,47 @@
+import mediawords.db
+from mediawords.test.db.create import create_test_topic
+from mediawords.test.solr import create_test_story_stack_for_indexing, setup_test_index
+import topics_mine.mine
+
+from mediawords.util.log import create_logger
+log = create_logger(__name__)
+
+def 
create_topic_for_import(db: mediawords.db.DatabaseHandler, num_stories : int = 200) -> dict: + """create a test topic and stories for import into the topic. + + return the topic. + """ + topic = create_test_topic(db, 'import') + + stack = {'medium_1': {'feed_1': [f"story_{_}" for _ in range(num_stories)]}} + create_test_story_stack_for_indexing(db, stack) + + all_media = db.query("select * from media").hashes() + all_stories = db.query("select * from stories").hashes() + + topic['start_date'] = '2020-01-01' + topic['end_date'] = '2020-06-01' + topic['solr_seed_query'] = '*:*' + topic['solr_seed_query_run'] = False + + db.update_by_id('topics', topic['topics_id'], topic) + + for m in all_media: + db.query( + "insert into topics_media_map (topics_id, media_id) values (%(a)s, %(b)s)", + {'a': topic['topics_id'], 'b': m['media_id']}) + + # distribute one story each day. this is kludgy but should work from a fresh databse with + # sequential stories_ids. assumes that there are more stories than days in the date range above + stories = db.query("select * from stories").hashes() + for (i, story) in enumerate(stories): + db.query( + """ + update stories set publish_date = %(a)s::timestamp + ((%(b)s || ' days')::interval) + where stories_id = %(c)s + """, + {'a': topic['start_date'], 'b': i, 'c': story['stories_id']}) + + setup_test_index(db) + + return topic diff --git a/apps/topics-mine/src/requirements.txt b/apps/topics-mine/src/requirements.txt new file mode 100644 index 0000000000..7d5173a96e --- /dev/null +++ b/apps/topics-mine/src/requirements.txt @@ -0,0 +1,2 @@ +# test text generation +lorem diff --git a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/AddTestTopicStories.pm b/apps/topics-mine/tests/perl/MediaWords/TM/Mine/AddTestTopicStories.pm deleted file mode 100644 index 643bd6860b..0000000000 --- a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/AddTestTopicStories.pm +++ /dev/null @@ -1,27 +0,0 @@ -package AddTestTopicStories; - -use strict; -use warnings; - -use MediaWords::CommonLibs; -use MediaWords::Test::DB::Create; -use MediaWords::TM::Stories; - -my $_topic_stories_medium_count = 0; - -sub add_test_topic_stories($$$$) -{ - my ( $db, $topic, $num_stories, $label ) = @_; - - my $medium = MediaWords::Test::DB::Create::create_test_medium( $db, "$label " . $_topic_stories_medium_count++ ); - my $feed = MediaWords::Test::DB::Create::create_test_feed( $db, $label, $medium ); - - for my $i ( 1 .. 
$num_stories ) - { - my $story = MediaWords::Test::DB::Create::create_test_story( $db, "$label $i", $feed ); - MediaWords::TM::Stories::add_to_topic_stories( $db, $story, $topic ); - $db->update_by_id( 'stories', $story->{ stories_id }, { publish_date => $topic->{ start_date } } ); - } -} - -1; diff --git a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_die_if_max_stories_exceeded.t b/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_die_if_max_stories_exceeded.t deleted file mode 100755 index eb4f2bbdb8..0000000000 --- a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_die_if_max_stories_exceeded.t +++ /dev/null @@ -1,52 +0,0 @@ -use strict; -use warnings; - -use Test::Deep; -use Test::More tests => 3; - -use MediaWords::CommonLibs; -use MediaWords::DB; -use MediaWords::Test::DB::Create; -use MediaWords::TM::Mine; - -use FindBin; -use lib $FindBin::Bin; - -use AddTestTopicStories; - -sub test_die_if_max_stories_exceeded($) -{ - my ( $db ) = @_; - - my $label = "test_die_if_max_stories_exceeded"; - - my $topic = MediaWords::Test::DB::Create::create_test_topic( $db, $label ); - - $topic = $db->update_by_id( 'topics', $topic->{ topics_id }, { max_stories => 0 } ); - - AddTestTopicStories::add_test_topic_stories( $db, $topic, 101, $label ); - - eval { MediaWords::TM::Mine::die_if_max_stories_exceeded( $db, $topic ); }; - ok( $@, "$label adding 101 stories to 0 max_stories topic generates error" ); - - $db->query( "delete from topic_stories where topics_id = ?", $topic->{ topics_id } ); - - $topic = $db->update_by_id( 'topics', $topic->{ topics_id }, { max_stories => 100 } ); - - AddTestTopicStories::add_test_topic_stories( $db, $topic, 99, $label ); - eval { MediaWords::TM::Mine::die_if_max_stories_exceeded( $db, $topic ); }; - ok( !$@, "$label adding 999 stories to a 100 max_stories does not generate an error: $@" ); - - AddTestTopicStories::add_test_topic_stories( $db, $topic, 102, $label ); - eval { MediaWords::TM::Mine::die_if_max_stories_exceeded( $db, $topic ); }; - ok( $@, "$label adding 2001 stories to a 100 max_stories generates an error" ); -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_die_if_max_stories_exceeded( $db ); -} - -main(); diff --git a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_import_urls_from_seed_query.t b/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_import_urls_from_seed_query.t deleted file mode 100644 index a8c3b29998..0000000000 --- a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_import_urls_from_seed_query.t +++ /dev/null @@ -1,87 +0,0 @@ -use strict; -use warnings; - -use Test::Deep; -use Test::More; - -use MediaWords::CommonLibs; -use MediaWords::DB; -use MediaWords::Test::DB::Create; -use MediaWords::TM::Mine; - -use FindBin; -use lib $FindBin::Bin; - -sub test_import_urls_from_seed_queries($) -{ - my ( $db ) = @_; - - my $label = "test_import"; - - my $topic = MediaWords::Test::DB::Create::create_test_topic( $db, $label ); - - $topic->{ start_date } = '2019-01-01'; - $topic->{ end_date } = '2019-02-01'; - $topic->{ platform } = 'generic_post'; - $topic->{ mode } = 'web'; - $topic->{ pattern } = 'foo'; - - $topic = $db->update_by_id( 'topics', $topic->{ topics_id }, $topic ); - - # posts.append({ - # 'post_id': post_id, - # 'content': "sample post for id id %s" % test_url, - # 'publish_date': publish_date, - # 'url': test_url, - # 'author': 'user-%s' % user_id, - # 'channel': 'channel-%s' % user_id, - # }) - my $posts_csv = < $topic->{ topics_id }, - source => 'csv', - platform => 'generic_post', - query 
=> $posts_csv - }; - my $topic_seed_query = $db->create( 'topic_seed_queries', $topic_seed_query_data ); - - $topic_seed_query_data->{ query } = $posts_2_csv; - my $topic_seed_query_2 = $db->create( 'topic_seed_queries', $topic_seed_query_data ); - - MediaWords::TM::Mine::import_urls_from_seed_queries( $db, $topic ); - - my $topic_posts = $db->query( <{ topics_id } )->hashes(); -select * - from topic_posts tp - join topic_post_days tpd using ( topic_post_days_id ) - join topic_seed_queries tsq using ( topic_seed_queries_id ) - where topics_id = ? -SQL - - is ( scalar( @{ $topic_posts } ), 2, "number of topic posts" ); - - my $tsus = $db->query( "select * from topic_seed_urls where topics_id = ?", $topic->{ topics_id } )->hashes(); - - is( scalar( @{ $tsus } ), 2, "number of seed urls" ); - -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_import_urls_from_seed_queries( $db ); - - done_testing(); -} - -main(); diff --git a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_respider.t b/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_respider.t deleted file mode 100755 index 019e075fd9..0000000000 --- a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_respider.t +++ /dev/null @@ -1,122 +0,0 @@ -use strict; -use warnings; - -use Test::Deep; -use Test::More tests => 4; - -use MediaWords::CommonLibs; -use MediaWords::DB; -use MediaWords::Test::DB::Create; -use MediaWords::TM::Mine; - -use FindBin; -use lib $FindBin::Bin; - -use AddTestTopicStories; - -sub test_respider($) -{ - my ( $db ) = @_; - - my $label = "test_respider"; - - my $topic = MediaWords::Test::DB::Create::create_test_topic( $db, $label ); - - $topic->{ start_date } = '2017-01-01'; - $topic->{ end_date } = '2018-01-01'; - - $topic = $db->update_by_id( - 'topics', - $topic->{ topics_id }, - { max_stories => 0, start_date => '2017-01-01', end_date => '2018-01-01' } - ); - - my $num_topic_stories = 101; - AddTestTopicStories::add_test_topic_stories( $db, $topic, $num_topic_stories, $label ); - - # no respidering without respider_stories - $db->query( "update topic_stories set link_mined = 't'" ); - - MediaWords::TM::Mine::set_stories_respidering( $db, $topic, undef ); - - my ( $got_num_respider_stories ) = $db->query( "select count(*) from topic_stories where not link_mined" )->flat; - is( $got_num_respider_stories, 0, "no stories marked for respidering" ); - - # respider everything with respider_stories but no dates - $topic->{ respider_stories } = 1; - - $db->query( "update topic_stories set link_mined = 't'" ); - - MediaWords::TM::Mine::set_stories_respidering( $db, $topic, undef ); - - ( $got_num_respider_stories ) = $db->query( "select count(*) from topic_stories where not link_mined" )->flat; - is( $got_num_respider_stories, $num_topic_stories, "all stories marked for respidering" ); - - # respider stories within the range of changed dates - my $topic_update = { - respider_stories => 't', - respider_end_date => $topic->{ end_date }, - respider_start_date => $topic->{ start_date }, - end_date => '2019-01-01', - start_date => '2016-01-01', - }; - $topic = $db->update_by_id( 'topics', $topic->{ topics_id }, $topic_update ); - - $db->query( "update topic_stories set link_mined = 't'" ); - - my $num_date_changes = 10; - $db->query( "update stories set publish_date = '2017-06-01'" ); - $db->query( <query( < $topic->{ topics_id }, - snapshot_date => MediaWords::Util::SQL::sql_now(), - start_date => $topic->{ start_date }, - end_date => $topic->{ end_date } - }; - $snapshot = $db->create( 'snapshots', 
$snapshot ); - - my $timespan_dates = - [ [ '2017-01-01', '2017-01-31' ], [ '2017-12-20', '2018-01-20' ], [ '2016-12-20', '2017-01-20' ] ]; - for my $dates ( @{ $timespan_dates } ) - { - my ( $start_date, $end_date ) = @{ $dates }; - my $timespan = { - snapshots_id => $snapshot->{ snapshots_id }, - start_date => $start_date, - end_date => $end_date, - period => 'monthly', - story_count => 0, - story_link_count => 0, - medium_count => 0, - medium_link_count => 0, - post_count => 0 - }; - $timespan = $db->create( 'timespans', $timespan ); - } - - MediaWords::TM::Mine::set_stories_respidering( $db, $topic, $snapshot->{ snapshots_id } ); - - ( $got_num_respider_stories ) = $db->query( "select count(*) from topic_stories where not link_mined" )->flat; - is( $got_num_respider_stories, 2 * $num_date_changes, "dated stories marked for respidering" ); - - my ( $got_num_archived_timespans ) = - $db->query( "select count(*) from timespans where archive_snapshots_id = ?", $snapshot->{ snapshots_id } )->flat; - is( $got_num_archived_timespans, 2, "number of archive timespans" ); -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_respider( $db ); -} - -main(); diff --git a/apps/topics-mine/tests/perl/test_cd_live_stories.t b/apps/topics-mine/tests/perl/test_cd_live_stories.t deleted file mode 100644 index f1faa7b413..0000000000 --- a/apps/topics-mine/tests/perl/test_cd_live_stories.t +++ /dev/null @@ -1,183 +0,0 @@ -use strict; -use warnings; - -# test that inserts and updates on stories in topic_stories are correctly mirrored to snap.live_stories - -use English '-no_match_vars'; - -use Test::More tests => 14; -use Test::Deep; - -use MediaWords::DB; -use MediaWords::Util::SQL; - -BEGIN -{ - use_ok( 'MediaWords::DB' ); -} - -sub add_topic_story -{ - my ( $db, $topic, $story ) = @_; - - $db->create( 'topic_stories', { stories_id => $story->{ stories_id }, topics_id => $topic->{ topics_id } } ); -} - -sub test_live_story_matches -{ - my ( $db, $topic, $story, $test_label ) = @_; - - my $live_story = $db->query( <{ topics_id }, $story->{ stories_id } )->hash; -select * from snap.live_stories where topics_id = ? and stories_id = ? -END - - delete( $live_story->{ topics_id } ); - delete( $live_story->{ topic_stories_id } ); - - $live_story->{ publish_date } =~ s/T/ /g; - $live_story->{ collect_date } =~ s/T/ /g; - $story->{ publish_date } =~ s/T/ /g; - $story->{ collect_date } =~ s/T/ /g; - - cmp_deeply( $live_story, $story, "$test_label: $story->{ title } should be in $topic->{ name } and match story" ); -} - -sub test_live_story_absent -{ - my ( $db, $topic, $story, $test_label ) = @_; - - my $live_story = $db->query( <{ topics_id }, $story->{ stories_id } )->hash; -select * from snap.live_stories where topics_id = ? and stories_id = ? -END - is( $live_story, undef, "$test_label: \$story->{ title } should be absent from \$topic->{ title }" ); -} - -sub update_story -{ - my ( $db, $story ) = @_; - - $story->{ url } ||= '/' . rand(); - $story->{ guid } ||= '/' . rand(); - $story->{ title } ||= ' ' . rand(); - $story->{ description } ||= ' ' . 
rand(); - $story->{ publish_date } = MediaWords::Util::SQL::get_sql_date_from_epoch( time() - int( rand( 100000 ) ) ); - $story->{ collect_date } = MediaWords::Util::SQL::get_sql_date_from_epoch( time() - int( rand( 100000 ) ) ); - - $db->update_by_id( 'stories', $story->{ stories_id }, $story ); - - return $db->find_by_id( 'stories', $story->{ stories_id } ); -} - -sub test_live_stories -{ - my ( $db ) = @_; - - my $medium = { - name => "test live stories", - url => "url://test/live/stories", - }; - $medium = $db->create( 'media', $medium ); - - my $topic_a = { - name => 'topic a', - pattern => '', - solr_seed_query => '', - solr_seed_query_run => 'f', - description => 'topic A', - start_date => '2017-01-01', - end_date => '2017-02-01', - job_queue => 'mc', - max_stories => 100_000, - platform => 'web' - }; - $topic_a = $db->create( 'topics', $topic_a ); - - my $topic_b = { - name => 'topic b', - pattern => '', - solr_seed_query => '', - solr_seed_query_run => 'f', - description => 'topic B', - start_date => '2017-01-01', - end_date => '2017-02-01', - job_queue => 'mc', - max_stories => 100_000, - platform => 'web' - }; - $topic_b = $db->create( 'topics', $topic_b ); - - my $story_a = { - media_id => $medium->{ media_id }, - url => 'url://story/a', - guid => 'guid://story/a', - title => 'story a', - description => 'description a', - publish_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 100000 ), - collect_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 200000 ), - full_text_rss => 't' - }; - $story_a = $db->create( 'stories', $story_a ); - - my $story_b = { - media_id => $medium->{ media_id }, - url => 'url://story/b', - guid => 'guid://story/b', - title => 'story b', - description => 'description b', - publish_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 300000 ), - collect_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 400000 ), - full_text_rss => 'f' - }; - $story_b = $db->create( 'stories', $story_b ); - - my $story_c = { - media_id => $medium->{ media_id }, - url => 'url://story/c', - guid => 'guid://story/c', - title => 'story c', - description => 'description c', - publish_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 500000 ), - collect_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 600000 ), - full_text_rss => 'f' - }; - $story_c = $db->create( 'stories', $story_c ); - - my $live_story = $db->query( "select * from snap.live_stories" )->hash; - is( $live_story, undef, "live stories empty before cs insert" ); - - add_topic_story( $db, $topic_a, $story_a ); - add_topic_story( $db, $topic_b, $story_b ); - add_topic_story( $db, $topic_a, $story_c ); - add_topic_story( $db, $topic_b, $story_c ); - - test_live_story_matches( $db, $topic_a, $story_a, "after insert" ); - test_live_story_absent( $db, $topic_b, $story_a, "after insert" ); - - test_live_story_matches( $db, $topic_b, $story_b, "after insert" ); - test_live_story_absent( $db, $topic_a, $story_b, "after insert" ); - - test_live_story_matches( $db, $topic_a, $story_c, "after insert" ); - test_live_story_matches( $db, $topic_b, $story_c, "after insert" ); - - $story_a = update_story( $db, $story_a ); - $story_b = update_story( $db, $story_b ); - $story_c = update_story( $db, $story_c ); - - test_live_story_matches( $db, $topic_a, $story_a, "after update" ); - test_live_story_absent( $db, $topic_b, $story_a, "after update" ); - - test_live_story_matches( $db, $topic_b, $story_b, "after update" ); - test_live_story_absent( 
$db, $topic_a, $story_b, "after update" ); - - test_live_story_matches( $db, $topic_a, $story_c, "after update" ); - test_live_story_matches( $db, $topic_b, $story_c, "after update" ); -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_live_stories( $db ); -} - -main(); diff --git a/apps/topics-mine/tests/perl/test_import_month_within_respider_date.t b/apps/topics-mine/tests/perl/test_import_month_within_respider_date.t deleted file mode 100644 index 091b27c979..0000000000 --- a/apps/topics-mine/tests/perl/test_import_month_within_respider_date.t +++ /dev/null @@ -1,56 +0,0 @@ -use strict; -use warnings; - -# test TM::Mine::_import_month_within_respider_date - -use English '-no_match_vars'; - -use Test::More; - -use MediaWords::TM::Mine; - -sub test_import_month_within_respider_date() -{ - my $topic = { - start_date => '2019-01-01', - end_date => '2019-06-01', - respider_stories => 'f', - respider_start_date => undef, - respider_end_date => undef - }; - - # if none of the respider setting are correct, we should always return true - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 0 ) ); - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 1 ) ); - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 100 ) ); - - # if respider_stories is true but neither respider date is set, always return true - $topic->{ respider_stories } = 1; - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 0 ) ); - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 1 ) ); - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 100 ) ); - - # should only import the dates after the respider end date - $topic->{ respider_end_date } = '2019-05-01'; - ok( !MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 0 ) ); - ok( !MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 3 ) ); - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 4 ) ); - - # make sure we capture the whole previous month if the end date is within a month - $topic->{ respider_end_date } = '2019-04-02'; - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 3 ) ); - - # should only import the dates before the repsider start date - $topic->{ respider_start_date } = '2019-02-01'; - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 0 ) ); - ok( !MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 1 ) ); -} - -sub main -{ - test_import_month_within_respider_date(); - - done_testing(); -} - -main(); diff --git a/apps/topics-mine/tests/perl/test_tm_mine.t b/apps/topics-mine/tests/perl/test_tm_mine.t deleted file mode 100644 index 2f3d96805a..0000000000 --- a/apps/topics-mine/tests/perl/test_tm_mine.t +++ /dev/null @@ -1,557 +0,0 @@ -use strict; -use warnings; - -# basic intergration test for topic mapper's spider - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use English '-no_match_vars'; - -use Data::Dumper; -use Digest::MD5 qw(md5_hex); -use MediaWords::Test::HashServer; -use Readonly; -use Sys::Hostname; -use Test::More; -use Text::Lorem::More; - -use MediaWords::DB; -use MediaWords::TM::Mine; -use MediaWords::Util::SQL; -use MediaWords::Util::Web; - -Readonly my $BASE_PORT => 8890; - -Readonly my $NUM_SITES => 5; -Readonly my $NUM_PAGES_PER_SITE => 10; -Readonly my $NUM_LINKS_PER_PAGE => 2; - -Readonly my $TOPIC_PATTERN => 'FOOBARBAZ'; - -sub get_html_link($) -{ - my ( $page ) = @_; - - my $lorem = 
Text::Lorem::More->new(); - - if ( 0 && int( rand( 3 ) ) ) - { - return "" . $lorem->words( 2 ) . ""; - } - else - { - return $page->{ url }; - } -} - -sub generate_content_for_site($) -{ - my ( $site ) = @_; - - my $lorem = Text::Lorem::More->new(); - - my $body = $lorem->sentences( 5 ); - - return < - - $site->{ title } - - -

-    <p>
-        $body
-    </p>

- - -HTML -} - -sub generate_content_for_page($$) -{ - my ( $site, $page ) = @_; - - my $lorem = Text::Lorem::More->new(); - - my $num_links = scalar( @{ $page->{ links } } ); - my $num_paragraphs = int( rand( 10 ) + 3 ) + $num_links; - - my $paragraphs = []; - - for my $i ( 0 .. $num_paragraphs - 1 ) - { - my $text = $lorem->sentences( 5 ); - if ( $i < $num_links ) - { - my $html_link = get_html_link( $page->{ links }->[ $i ] ); - $text .= " $html_link"; - } - - push( @{ $paragraphs }, $text ); - } - - if ( rand( 2 ) < 1 ) - { - push( @{ $paragraphs }, $lorem->words( 10 ) . " $TOPIC_PATTERN" ); - $page->{ matches_topic } = 1; - } - - my $dead_link_text = $lorem->sentences( 5 ); - $dead_link_text .= " dead link"; - - push( @{ $paragraphs }, $dead_link_text ); - - my $body = join( "\n\n", map { "

<p>\n$_\n</p>

" } @{ $paragraphs } ); - - return < - - $page->{ title } - - - $body - - -HTML - -} - -sub generate_content_for_sites($) -{ - my ( $sites ) = @_; - - for my $site ( @{ $sites } ) - { - $site->{ content } = generate_content_for_site( $site ); - - for my $page ( @{ $site->{ pages } } ) - { - $page->{ content } = generate_content_for_page( $site, $page ); - } - } -} - -# generate test set of sites -sub get_test_sites() -{ - my $sites = []; - my $pages = []; - - # my $base_port = $BASE_PORT + int( rand( 200 ) ); - my $base_port = $BASE_PORT; - - for my $site_id ( 0 .. $NUM_SITES - 1 ) - { - my $port = $base_port + $site_id; - - my $site = { - port => $port, - id => $site_id, - - # Other containers will access this host to we have to set the - # actual hostname instead of just localhost - url => "http://" . Sys::Hostname::hostname . ":$port/", - - title => "site $site_id" - }; - - my $num_pages = int( rand( $NUM_PAGES_PER_SITE ) ) + 1; - for my $page_id ( 0 .. $num_pages - 1 ) - { - my $date = MediaWords::Util::SQL::get_sql_date_from_epoch( time() - ( rand( 365 ) * 86400 ) ); - - my $path = "page-$page_id"; - - my $page = { - id => $page_id, - path => "/$path", - url => "$site->{ url }$path", - title => "page $page_id", - pubish_date => $date, - links => [] - }; - - push( @{ $pages }, $page ); - push( @{ $site->{ pages } }, $page ); - } - - push( @{ $sites }, $site ); - } - - my $all_pages = []; - map { push( @{ $all_pages }, @{ $_->{ pages } } ) } @{ $sites }; - for my $page ( @{ $all_pages } ) - { - my $num_links = int( rand( $NUM_LINKS_PER_PAGE ) ); - for my $link_id ( 0 .. $num_links - 1 ) - { - my $linked_page_id = int( rand( scalar( @{ $all_pages } ) ) ); - my $linked_page = $all_pages->[ $linked_page_id ]; - - unless ( MediaWords::Util::URL::urls_are_equal( $page->{ url }, $linked_page->{ url } ) ) - { - push( @{ $page->{ links } }, $linked_page ); - } - } - } - - generate_content_for_sites( $sites ); - - return $sites; -} - -# add a medium for each site so that the topic mapper's spider can find the medium that corresponds to each url -sub add_site_media($$) -{ - my ( $db, $sites ) = @_; - - for my $site ( @{ $sites } ) - { - $site->{ medium } = $db->create( - 'media', - { - url => $site->{ url }, - name => $site->{ title }, - } - ); - } -} - -sub start_hash_servers($) -{ - my ( $sites ) = @_; - - my $hash_servers = []; - - for my $site ( @{ $sites } ) - { - my $site_hash = {}; - - $site_hash->{ '/' } = $site->{ content }; - - map { $site_hash->{ $_->{ path } } = $_->{ content } } @{ $site->{ pages } }; - - my $hs = MediaWords::Test::HashServer->new( $site->{ port }, $site_hash ); - - DEBUG "starting hash server $site->{ id }"; - - $hs->start(); - - push( @{ $hash_servers }, $hs ); - } - - # wait for the hash servers to start - sleep( 1 ); - - return $hash_servers; -} - -sub test_page($$$) -{ - my ( $label, $url, $expected_content ) = @_; - - TRACE "test page: $label $url"; - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $request = MediaWords::Util::Web::UserAgent::Request->new( 'GET', $url ); - my $response = $ua->request( $request ); - - ok( $response->is_success, "request success: $label $url" ); - - my $got_content = $response->decoded_content; - - TRACE "got content"; - - is( $got_content, $expected_content, "simple page test: $label" ); -} - -sub test_pages($) -{ - my ( $sites ) = @_; - - for my $site ( @{ $sites } ) - { - DEBUG "testing pages for site $site->{ id }"; - test_page( "site $site->{ id }", $site->{ url }, $site->{ content } ); - - map { test_page( "page 
$site->{ id } $_->{ id }", $_->{ url }, $_->{ content } ) } @{ $site->{ pages } }; - } -} - -sub seed_unlinked_urls($$$) -{ - my ( $db, $topic, $sites ) = @_; - - my $all_pages = []; - map { push( @{ $all_pages }, @{ $_->{ pages } } ) } @{ $sites }; - - # do not seed urls that are linked directly from a page that is a topic match. - # this forces the test to succesfully discover those pages through spidering. - my $non_seeded_url_lookup = {}; - for my $page ( @{ $all_pages } ) - { - if ( $page->{ matches_topic } ) - { - map { $non_seeded_url_lookup->{ $_->{ url } } = 1 } @{ $page->{ links } }; - } - } - - my $seed_pages = []; - for my $page ( @{ $all_pages } ) - { - if ( $non_seeded_url_lookup->{ $page->{ url } } ) - { - DEBUG( "non seeded url: $page->{ url }" ); - } - else - { - DEBUG( "seed url: $page->{ url }" ); - push( @{ $seed_pages }, $page ); - } - } - - for my $seed_page ( @{ $all_pages } ) - { - $db->create( - 'topic_seed_urls', - { - topics_id => $topic->{ topics_id }, - url => $seed_page->{ url } - } - ); - } -} - -sub create_topic($$) -{ - my ( $db, $sites ) = @_; - - my $now = MediaWords::Util::SQL::sql_now(); - my $start_date = MediaWords::Util::SQL::increment_day( $now, -30 ); - my $end_date = MediaWords::Util::SQL::increment_day( $now, 30 ); - - my $topic = $db->create( - 'topics', - { - name => 'test topic', - description => 'test topic', - pattern => $TOPIC_PATTERN, - solr_seed_query => 'stories_id:0', - solr_seed_query_run => 't', - start_date => $start_date, - end_date => $end_date, - job_queue => 'mc', - max_stories => 100_000, - platform => 'web' - } - ); - - seed_unlinked_urls( $db, $topic, $sites ); - - # avoid race condition in TM::Mine - $db->create( 'tag_sets', { name => 'extractor_version' } ); - - return $topic; -} - -sub test_topic_stories($$$) -{ - my ( $db, $topic, $sites ) = @_; - - my $topic_stories = $db->query( <{ topics_id } )->hashes; -select cs.*, s.* - from topic_stories cs - join stories s on ( s.stories_id = cs.stories_id ) - where cs.topics_id = ? -SQL - - my $all_pages = []; - map { push( @{ $all_pages }, @{ $_->{ pages } } ) } @{ $sites }; - - DEBUG "ALL PAGES: " . scalar( @{ $all_pages } ); - - my $topic_pages = [ grep { $_->{ matches_topic } } @{ $all_pages } ]; - - DEBUG "TOPIC PAGES: " . scalar( @{ $topic_pages } ); - - my $topic_pages_lookup = {}; - map { $topic_pages_lookup->{ $_->{ url } } = $_ } @{ $topic_stories }; - - for my $topic_story ( @{ $topic_stories } ) - { - ok( $topic_pages_lookup->{ $topic_story->{ url } }, "topic story found for topic page '$topic_story->{ url }'" ); - - delete( $topic_pages_lookup->{ $topic_story->{ url } } ); - } - - is( scalar( keys( %{ $topic_pages_lookup } ) ), - 0, "missing topic story for topic pages: " . 
Dumper( values( %{ $topic_pages_lookup } ) ) ); - - # Wait for pending URLs to disappear - Readonly my $WAIT_PENDING_SECONDS => 10; - my $pending_count = 0; - for ( my $pending_retry = 0; $pending_retry <= $WAIT_PENDING_SECONDS; ++$pending_retry ) { - ( $pending_count ) = $db->query( "select count(*) from topic_fetch_urls where state ='pending'" )->flat; - if ( $pending_count > 0 ) { - WARN "Still $pending_count URLs are pending, will retry shortly"; - sleep( 1 ); - } else { - INFO "No more pending URLs, continuing"; - last; - } - } - is( $pending_count, 0, "After waiting $WAIT_PENDING_SECONDS some URLs are still in 'pending' state" ); - - my ( $dead_link_count ) = $db->query( "select count(*) from topic_fetch_urls where state ='request failed'" )->flat; - is( $dead_link_count, scalar( @{ $topic_pages } ), "dead link count" ); - - if ( $dead_link_count != scalar( @{ $topic_pages } ) ) - { - my $fetch_states = $db->query( "select count(*), state from topic_fetch_urls group by state" )->hashes(); - WARN( "fetch states: " . Dumper( $fetch_states ) ); - - my $fetch_errors = $db->query( "select * from topic_fetch_urls where state = 'python error'" )->hashes(); - WARN( "fetch errors: " . Dumper( $fetch_errors ) ); - } -} - -sub test_topic_links($$$) -{ - my ( $db, $topic, $sites ) = @_; - - my $cid = $topic->{ topics_id }; - - my $cl = $db->query( "select * from topic_links" )->hashes; - - TRACE "topic links: " . Dumper( $cl ); - - my $all_pages = []; - map { push( @{ $all_pages }, @{ $_->{ pages } } ) } @{ $sites }; - - for my $page ( @{ $all_pages } ) - { - next if ( !$page->{ matches_topic } ); - - for my $link ( @{ $page->{ links } } ) - { - next unless ( $link->{ matches_topic } ); - - my $topic_links = $db->query( <{ url }, $link->{ url }, $cid )->hashes; -select * - from topic_links cl - join stories s on ( cl.stories_id = s.stories_id ) - where - s.url = \$1 and - cl.url = \$2 and - cl.topics_id = \$3 -SQL - - is( scalar( @{ $topic_links } ), 1, "number of topic_links for $page->{ url } -> $link->{ url }" ); - } - } - - my $topic_spider_metric = $db->query( <{ topics_id } )->hash; -select sum( links_processed ) links_processed from topic_spider_metrics where topics_id = ? -SQL - - ok( $topic_spider_metric, "topic spider metrics exist" ); - ok( $topic_spider_metric->{ links_processed } > scalar( @{ $cl } ), "metrics links_processed greater than topic_links" ); -} - -# test that no errors exist in the topics or snapshots tables -sub test_for_errors($) -{ - my ( $db ) = @_; - - my $error_topics = $db->query( "select * from topics where state = 'error'" )->hashes; - - ok( scalar( @{ $error_topics } ) == 0, "topic errors: " . Dumper( $error_topics ) ); - - my $error_snapshots = $db->query( "select * from snapshots where state = 'error'" )->hashes; - - ok( scalar( @{ $error_snapshots } ) == 0, "snapshot errors: " . 
Dumper( $error_snapshots ) ); -} - -sub test_spider_results($$$) -{ - my ( $db, $topic, $sites ) = @_; - - test_topic_stories( $db, $topic, $sites ); - - test_topic_links( $db, $topic, $sites ); - - test_for_errors( $db ); -} - -sub get_site_structure($) -{ - my ( $sites ) = @_; - - my $meta_sites = []; - for my $site ( @{ $sites } ) - { - my $meta_site = { url => $site->{ url } }; - for my $page ( @{ $site->{ pages } } ) - { - my $meta_page = { url => $page->{ url }, matches_topic => $page->{ matches_topic } }; - map { push( @{ $meta_page->{ links } }, $_->{ url } ) } @{ $page->{ links } }; - - $meta_page->{ content } = $page->{ content } - if ( $page->{ matches_topic } && $page->{ matches_topic } ); - - push( @{ $meta_site->{ pages } }, $meta_page ); - } - - push( @{ $meta_sites }, $meta_site ); - } - - return $meta_sites; -} - -sub test_spider($) -{ - my ( $db ) = @_; - - # we pseudo-randomly generate test data, but we want repeatable tests - srand( 3 ); - - MediaWords::Util::Mail::enable_test_mode(); - - my $sites = get_test_sites(); - - TRACE "SITE STRUCTURE " . Dumper( get_site_structure( $sites ) ); - - add_site_media( $db, $sites ); - - my $hash_servers = start_hash_servers( $sites ); - - test_pages( $sites ); - - my $topic = create_topic( $db, $sites ); - - my $mine_args = { - topics_id => $topic->{ topics_id }, - skip_post_processing => 1, # - cache_broken_downloads => 0, # - import_only => 0, # - skip_outgoing_foreign_rss_links => 0, # - test_mode => 1 - }; - - MediaWords::TM::Mine::mine_topic( $db, $topic, $mine_args ); - - test_spider_results( $db, $topic, $sites ); - - map { $_->stop } @{ $hash_servers }; -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_spider( $db ); - - done_testing(); -} - -main(); diff --git a/apps/topics-mine/tests/python/test_add_new_links.py b/apps/topics-mine/tests/python/test_add_new_links.py new file mode 100644 index 0000000000..41c036290a --- /dev/null +++ b/apps/topics-mine/tests/python/test_add_new_links.py @@ -0,0 +1,34 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_links(): + db = mediawords.db.connect_to_db() + + num_urls = 100 + + topic = create_test_topic(db, 'foo') + create_test_topic_stories(db, topic, 1, num_urls); + + # add a bunch of urls with bad urls. 
the fetch-link job will fail with a python error + # but that's fine becase all we are testing here is that each url makes it into the job pool + db.query("delete from topic_links") + links = db.query( + """ + insert into topic_links (topics_id, stories_id, url) + select topics_id, stories_id, 'U ' || stories_id::text from topic_stories + returning * + """).hashes() + + topics_mine.mine.ADD_NEW_LINKS_CHUNK_SIZE = int(num_urls / 2) - 1 + + topics_mine.mine.add_new_links(db, topic, 1, links, None) + + count_processed_tfus = db.query("select count(*) from topic_fetch_urls where state = 'request failed'").flat()[0] + assert count_processed_tfus == num_urls + + count_spidered_links = db.query("select count(*) from topic_links where link_spidered").flat()[0] + assert count_spidered_links == num_urls diff --git a/apps/topics-mine/tests/python/test_check_error_rate.py b/apps/topics-mine/tests/python/test_check_error_rate.py new file mode 100644 index 0000000000..c8cbc4fe94 --- /dev/null +++ b/apps/topics-mine/tests/python/test_check_error_rate.py @@ -0,0 +1,60 @@ +import unittest + +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +from topics_mine.mine import check_job_error_rate, McTopicMineError + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +class TestCheckJobErrorRate(unittest.TestCase): + + def test_check_error_Rate(self): + db = mediawords.db.connect_to_db() + + topic = create_test_topic(db, 'foo') + + # first call should not raise an error because there are not topic_fetch_urls + check_job_error_rate(db, topic) + + num_tfus = 100 + + for i in range(num_tfus): + tfu = { + 'topics_id': topic['topics_id'], + 'url': str(i), + 'state': 'pending' + } + db.create('topic_fetch_urls', tfu) + + # still should not return an error with all pending tfus + check_job_error_rate(db, topic) + + db.query("update topic_fetch_urls set state = 'python error' where url = '1'") + + # only one error, so still no exception + check_job_error_rate(db, topic) + + db.query("update topic_fetch_urls set state = 'python error'") + + # now with all errors we should get an exception + self.assertRaises(McTopicMineError, check_job_error_rate, db, topic) + + db.query("update topic_fetch_urls set state = 'pending'") + + num_stories = 100 + + create_test_topic_stories(db, topic, num_stories) + + # should not return an error with no errors in topic_stories + check_job_error_rate(db, topic) + + db.query("update topic_stories set link_mine_error = 'test error' where stories_id = 1") + + # still should not throw an exception with only one error + check_job_error_rate(db, topic) + + db.query("update topic_stories set link_mine_error = 'test error'") + + # now throw an exception since there are too many errors + self.assertRaises(McTopicMineError, check_job_error_rate, db, topic) diff --git a/apps/topics-mine/tests/python/test_fetch_links.py b/apps/topics-mine/tests/python/test_fetch_links.py new file mode 100644 index 0000000000..e87dfb8aaa --- /dev/null +++ b/apps/topics-mine/tests/python/test_fetch_links.py @@ -0,0 +1,23 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic +from topics_mine.mine import fetch_links + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_links(): + db = mediawords.db.connect_to_db() + + topic = create_test_topic(db, 'foo') + + num_urls = 100 + + # add a bunch of urls with bad urls. 
the fetch-link job will fail with a python error + # but that's fine becase all we are testing here is that each url makes it into the job pool + + links = [{'url': f"INVALID URL {i}"} for i in range(num_urls)] + + fetch_links(db, topic, links) + + count_processed_tfus = db.query("select count(*) from topic_fetch_urls where state = 'request failed'").flat()[0] + assert count_processed_tfus == num_urls diff --git a/apps/topics-mine/tests/python/test_fetch_social_media_data.py b/apps/topics-mine/tests/python/test_fetch_social_media_data.py new file mode 100644 index 0000000000..cec7c7f437 --- /dev/null +++ b/apps/topics-mine/tests/python/test_fetch_social_media_data.py @@ -0,0 +1,25 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +from topics_mine.mine import fetch_social_media_data + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_social_media_data(): + db = mediawords.db.connect_to_db() + + num_stories = 20 + + topic = create_test_topic(db, 'foo') + create_test_topic_stories(db, topic, 1, num_stories) + + db.query("update stories set url = stories_id::text") + + fetch_social_media_data(db, topic) + + num_fetched_stories = db.query( + "select count(*) from story_statistics where facebook_api_error like '%URL is not HTTP%'").flat()[0] + + log.warning(db.query("select facebook_api_error from story_statistics").flat()) + + assert num_fetched_stories == num_stories diff --git a/apps/topics-mine/tests/python/test_fetch_twitter_urls.py b/apps/topics-mine/tests/python/test_fetch_twitter_urls.py new file mode 100644 index 0000000000..888cd12836 --- /dev/null +++ b/apps/topics-mine/tests/python/test_fetch_twitter_urls.py @@ -0,0 +1,42 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic +from topics_mine.mine import _fetch_twitter_urls + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_twitter_urls(): + db = mediawords.db.connect_to_db() + + topic = create_test_topic(db, 'foo') + + num_urls = 100 + + # add a bunch of urls with non-twitter urls. 
the fetch-twitter-urls job will fail with a python error + # when the urls cannot be parsed for twitter statuses, but that's fine becase all we are testing here + # is that each url makes it into the fetch_twitter_url job pool + + tfus = [] + for i in range(num_urls): + tfu = { + 'topics_id': topic['topics_id'], + 'url': 'http://not.a.twitter.url', + 'state': 'tweet pending' + } + tfu = db.create("topic_fetch_urls", tfu) + + tfus.append(tfu) + + tfu_ids = [tfu['topic_fetch_urls_id'] for tfu in tfus] + + _fetch_twitter_urls(db, topic, tfu_ids) + + # if every url passed to the queue gets tagged with a url error, that means they all got processed + # by the fetch-twitter-urls pool + count_processed_tfus = db.query( + """ + select count(*) from topic_fetch_urls + where state = 'python error' and message like '%McFetchTwitterUrlsDataException%' + """).flat()[0] + + assert count_processed_tfus == num_urls diff --git a/apps/topics-mine/tests/python/test_generate_topic_links.py b/apps/topics-mine/tests/python/test_generate_topic_links.py new file mode 100644 index 0000000000..06a7f11975 --- /dev/null +++ b/apps/topics-mine/tests/python/test_generate_topic_links.py @@ -0,0 +1,29 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +from topics_mine.mine import generate_topic_links + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_generate_topic_links(): + db = mediawords.db.connect_to_db() + + num_stories = 100 + + topic = create_test_topic(db, 'foo') + create_test_topic_stories(db, topic, 1, num_stories) + + stories = db.query("select * from stories").hashes() + + num_topic_stories = db.query("select count(*) from topic_stories").flat()[0] + assert num_topic_stories == num_stories + + db.query("update stories set description = 'http://foo.com/' || stories_id::text") + + generate_topic_links(db, topic, stories) + + num_unmined_stories = db.query("select count(*) from topic_stories where not link_mined").flat()[0] + assert num_unmined_stories == 0 + + num_mined_links = db.query("select count(*) from topic_links").flat()[0] + assert num_mined_links == num_stories diff --git a/apps/topics-mine/tests/python/test_import_month_with_respider_date.py b/apps/topics-mine/tests/python/test_import_month_with_respider_date.py new file mode 100644 index 0000000000..2a82fcafa5 --- /dev/null +++ b/apps/topics-mine/tests/python/test_import_month_with_respider_date.py @@ -0,0 +1,38 @@ +from topics_mine.mine import _import_month_within_respider_date + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_import_month_with_respider_date(): + topic = { + 'start_date': '2019-01-01', + 'end_date': '2019-06-01', + 'respider_stories': 'f', + 'respider_start_date': None, + 'respider_end_date': None} + + # if none of the respider setting are correct, we should always return true + assert _import_month_within_respider_date(topic, 0) + assert _import_month_within_respider_date(topic, 1) + assert _import_month_within_respider_date(topic, 100) + + # if respider_stories is true but neither respider date is set, always return true + topic['respider_stories'] = 1 + assert _import_month_within_respider_date(topic, 0) + assert _import_month_within_respider_date(topic, 1) + assert _import_month_within_respider_date(topic, 100) + + # should only import the dates after the respider end date + topic['respider_end_date'] = '2019-05-01' + assert not _import_month_within_respider_date(topic, 0) + assert not 
_import_month_within_respider_date(topic, 3) + assert _import_month_within_respider_date(topic, 4) + + # make sure we capture the whole previous month if the end date is within a month + topic['respider_end_date'] = '2019-04-02' + assert _import_month_within_respider_date(topic, 3) + + # should only import the dates before the repsider start date + topic['respider_start_date'] = '2019-02-01' + assert _import_month_within_respider_date(topic, 0) + assert not _import_month_within_respider_date(topic, 1) diff --git a/apps/topics-mine/tests/python/test_import_seed_urls.py b/apps/topics-mine/tests/python/test_import_seed_urls.py new file mode 100644 index 0000000000..a62d74c100 --- /dev/null +++ b/apps/topics-mine/tests/python/test_import_seed_urls.py @@ -0,0 +1,29 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_links(): + db = mediawords.db.connect_to_db() + + num_urls = 100 + + topic = create_test_topic(db, 'foo') + + for i in range(num_urls): + tsu = { + 'topics_id': topic['topics_id'], + 'processed': 'false', + 'url': f'INVALID URL {i}'} + db.create('topic_seed_urls', tsu) + + topics_mine.mine.ADD_NEW_LINKS_CHUNK_SIZE = int(num_urls / 2) - 1 + topics_mine.mine.import_seed_urls(db, topic, None) + + count_processed_tfus = db.query("select count(*) from topic_fetch_urls where state = 'request failed'").flat()[0] + assert count_processed_tfus == num_urls + + count_processed_urls = db.query("select count(*) from topic_seed_urls where processed").flat()[0] + assert count_processed_urls == num_urls diff --git a/apps/topics-mine/tests/python/test_import_solr_seed_query.py b/apps/topics-mine/tests/python/test_import_solr_seed_query.py new file mode 100644 index 0000000000..dd0e5a7f2e --- /dev/null +++ b/apps/topics-mine/tests/python/test_import_solr_seed_query.py @@ -0,0 +1,29 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic +from mediawords.test.solr import create_test_story_stack_for_indexing, setup_test_index +import topics_mine.mine +import topics_mine.test + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_import_solr_seed_query(): + db = mediawords.db.connect_to_db() + num_stories = 200 + + topic = topics_mine.test.create_topic_for_import(db=db, num_stories=num_stories) + + topics_mine.mine.import_solr_seed_query(db, topic) + + date_stories = db.query( + "select * from stories where publish_date <= %(a)s", + {'a': topic['end_date']}).hashes() + + date_stories_urls = [s['url'] for s in date_stories] + + count_topic_seed_urls = db.query( + "select count(distinct url) from topic_seed_urls where url = any(%(a)s)", + {'a': date_stories_urls}).flat()[0] + + assert len(date_stories) > 0, f"offset {i}" + assert len(date_stories) == count_topic_seed_urls, f"topic seed urls for month offset {i}" diff --git a/apps/topics-mine/tests/python/test_import_solr_seed_query_month.py b/apps/topics-mine/tests/python/test_import_solr_seed_query_month.py new file mode 100644 index 0000000000..bd88388022 --- /dev/null +++ b/apps/topics-mine/tests/python/test_import_solr_seed_query_month.py @@ -0,0 +1,40 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic +from mediawords.test.solr import create_test_story_stack_for_indexing, setup_test_index +import topics_mine.mine +import topics_mine.test + +from mediawords.util.log import create_logger +log = 
create_logger(__name__) + +def test_import_solr_seed_query_month(): + db = mediawords.db.connect_to_db() + num_stories = 200 + + topic = topics_mine.test.create_topic_for_import(db=db, num_stories=num_stories) + + i = 0 + while topics_mine.mine.import_solr_seed_query_month(db, topic, i): + date_stories = db.query( + """ + select * from stories + where + publish_date >= %(a)s::timestamp + ((%(b)s || ' months')::interval) and + publish_date <= %(a)s::timestamp + ((%(c)s || ' months')::interval) and + publish_date <= %(d)s + """, + {'a': topic['start_date'], 'b': i, 'c': i + 1, 'd': topic['end_date']}).hashes() + + date_stories_urls = [s['url'] for s in date_stories] + + count_topic_seed_urls = db.query( + "select count(distinct url) from topic_seed_urls where url = any(%(a)s)", + {'a': date_stories_urls}).flat()[0] + + assert len(date_stories) > 0, f"offset {i}" + assert len(date_stories) == count_topic_seed_urls, f"topic seed urls for month offset {i}" + + i += 1 + + + diff --git a/apps/topics-mine/tests/python/test_import_urls_from_seed_queries.py b/apps/topics-mine/tests/python/test_import_urls_from_seed_queries.py new file mode 100644 index 0000000000..694a11c4a5 --- /dev/null +++ b/apps/topics-mine/tests/python/test_import_urls_from_seed_queries.py @@ -0,0 +1,43 @@ +import csv +import io + +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_import_urls_from_seed_queries(): + db = mediawords.db.connect_to_db() + + num_stories = 100 + + topic = create_test_topic(db, 'foo') + topic['pattern'] = '.*' + topic = db.update_by_id('topics', topic['topics_id'], topic) + + date = topic['start_date'] + + posts = [{'author': i, 'publish_date': date, 'content': f'http://u.u/{i}'} for i in range(num_stories)] + + csv_io = io.StringIO() + csv_writer = csv.DictWriter(csv_io, fieldnames=posts[0].keys()) + csv_writer.writeheader() + [csv_writer.writerow(p) for p in posts] + + seed_csv = csv_io.getvalue() + + tsq = { + 'topics_id': topic['topics_id'], + 'source': 'csv', + 'platform': 'generic_post', + 'query': seed_csv + } + tsq = db.create('topic_seed_queries', tsq) + + topics_mine.mine.import_urls_from_seed_queries(db, topic, None) + + num_tsus = db.query("select count(distinct url) from topic_seed_urls").flat()[0] + + assert num_tsus == num_stories diff --git a/apps/topics-mine/tests/python/test_mine.py b/apps/topics-mine/tests/python/test_mine.py new file mode 100644 index 0000000000..280e21e04e --- /dev/null +++ b/apps/topics-mine/tests/python/test_mine.py @@ -0,0 +1,407 @@ +import random +import socket +import time + +import lorem + +import mediawords.db +import mediawords.test.hash_server +import mediawords.util.sql +from mediawords.util.web.user_agent import UserAgent +from mediawords.util.web.user_agent.request.request import Request + +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +BASE_PORT = 8890 + +NUM_SITES = 5 +NUM_PAGES_PER_SITE = 10 +NUM_LINKS_PER_PAGE = 2 + +TOPIC_PATTERN = 'FOOBARBAZ' + +def get_html_link(page): + return page['url'] + +def lorem_sentences(n: int) -> str: + return ' '.join([lorem.sentence() for i in range(n)]) + +def generate_content_for_site(site): + body = lorem_sentences(5) + + return f""" + + + site['title'] + + +

+            <p>
+                {body}
+            </p>

+ + + """ + +def randindex(n): + """generate a random int >= 0 and < n.""" + return random.randint(0, n - 1) + +def generate_content_for_page(site, page): + num_links = len(page['links']) + num_paragraphs = int(randindex(10) + 3) + num_links + + paragraphs = [] + + for i in range(num_paragraphs): + text = lorem_sentences(5) + if i < num_links: + html_link = get_html_link(page['links'][i]) + text += f" {html_link}" + + paragraphs.append(text) + + if randindex(2) < 1: + paragraphs.append(lorem.sentence() + f" {TOPIC_PATTERN}") + page['matches_topic'] = 1 + + dead_link_text = lorem_sentences(5) + dead_link_text += f" dead link" + + paragraphs.append(dead_link_text) + + body = "\n\n".join([f"

<p>\n{p}\n</p>

" for p in paragraphs]) + + return f""" + + + {page['title']} + + + {body} + + + """ + +def generate_content_for_sites(sites): + for site in sites: + site['content'] = generate_content_for_site(site) + + for p in site['pages']: + p['content'] = generate_content_for_page(site, p) + +def get_test_sites(): + """ generate test set of sites""" + sites = [] + pages = [] + + # base_port = BASE_PORT + int(rand( 200) ) + base_port = BASE_PORT + + for site_id in range(NUM_SITES): + port = base_port + site_id + # other containers will access this host to we have to set the actual hostname instead of just localhost + host = socket.gethostname() + + site = { + 'port': port, + 'id': site_id, + 'url': f"http://{host}:{port}/", + 'title': f"site {site_id}", + 'pages': [] + } + + num_pages = int(randindex(NUM_PAGES_PER_SITE)) + 1 + for page_id in range(num_pages): + date = mediawords.util.sql.get_sql_date_from_epoch(time.time() - (randindex(365) * 86400)) + + path = f"page-{page_id}" + + page = { + 'id': page_id, + 'path': f"/{path}", + 'url': f"{site['url']}{path}", + 'title': f"page {page_id}", + 'pubish_date': date, + 'links': [], + 'matches_topic': False + } + + pages.append(page) + site['pages'].append(page) + + sites.append(site) + + for page in pages: + num_links = int(randindex(NUM_LINKS_PER_PAGE)) + for link_id in range(num_links): + linked_page_id = int(randindex(len(pages))) + linked_page = pages[linked_page_id] + + if not mediawords.util.url.urls_are_equal(page['url'], linked_page['url']): + page['links'].append(linked_page) + + generate_content_for_sites(sites) + + return sites + +def add_site_media(db, sites): + """add a medium for each site so that the spider can find the medium that corresponds to each url""" + for s in sites: + s['medium'] = db.create('media', {'url': s['url'], 'name': s['title']}) + +def start_hash_servers(sites): + hash_servers = [] + + for site in sites: + site_hash = {} + site_hash['/'] = site['content'] + + for p in site['pages']: + site_hash[p['path']] = p['content'] + + hs = mediawords.test.hash_server.HashServer(port=site['port'], pages=site_hash) + + log.debug(f"starting hash server {site['id']}") + + hs.start() + + hash_servers.append(hs) + + # wait for the hash servers to start + time.sleep(1) + + return hash_servers + +def validate_page(label, url, expected_content): + + log.debug(f"test page: {label} {url}") + + ua = UserAgent() + request = Request('get', url) + response = ua.request(request) + + assert response.is_success(), f"request success: {label} {url}" + + got_content = response.decoded_content() + + log.debug("got content") + + assert got_content == expected_content + +def validate_pages(sites): + for site in sites: + log.debug(f"testing pages for site {site['id']}") + validate_page(f"site {site['id']}", site['url'], site['content']) + + [validate_page(f"page {site['id']} p{['id']}", p['url'], p['content']) for p in site['pages']] + +def seed_unlinked_urls(db, topic, sites): + all_pages = [] + [all_pages.extend(s['pages']) for s in sites] + + # do not seed urls that are linked directly from a page that is a topic match. + # this forces the test to succesfully discover those pages through spidering. 
+ non_seeded_url_lookup = {} + for page in all_pages: + if page['matches_topic']: + for l in page['links']: + non_seeded_url_lookup[l['url']] = 1 + + seed_pages = [] + for page in all_pages: + if non_seeded_url_lookup.get(page['url'], False): + log.debug(f"non seeded url: {page['url']}") + else: + log.debug(f"seed url: {page['url']}") + seed_pages.append(page) + + [db.create('topic_seed_urls', {'topics_id': topic['topics_id'], 'url': p['url']}) for p in seed_pages] + +def create_topic(db, sites): + now = mediawords.util.sql.sql_now() + start_date = mediawords.util.sql.increment_day(now, -30) + end_date = mediawords.util.sql.increment_day(now, 30) + + topic = { + 'name': 'test topic', + 'description': 'test topic', + 'pattern': TOPIC_PATTERN, + 'solr_seed_query': 'stories_id:0', + 'solr_seed_query_run': 't', + 'start_date': start_date, + 'end_date': end_date, + 'job_queue': 'mc', + 'max_stories': 100_000, + 'platform': 'web' + } + topic = db.create('topics', topic) + + seed_unlinked_urls(db, topic, sites) + + # avoid race condition in TM::Mine + db.create('tag_sets', {'name': 'extractor_version'}) + + return topic + +def validate_topic_stories(db, topic, sites): + topic_stories = db.query( + """ + select cs.*, s.* + from topic_stories cs + join stories s on (s.stories_id = cs.stories_id) + where cs.topics_id = %(a)s + """, + {'a': topic['topics_id']}).hashes() + + all_pages = [] + [all_pages.extend(s['pages']) for s in sites] + + log.info(f"ALL PAGES: {len(all_pages)}") + + topic_pages = [p for p in all_pages if p['matches_topic']] + + log.info(f"TOPIC PAGES: {len(topic_pages)}") + + topic_pages_lookup = {s['url']: s for s in topic_stories} + + log.info(f"TOPIC PAGES LOOKUP: {len(topic_pages_lookup)}") + + for topic_story in topic_stories: + assert topic_pages_lookup.get(topic_story['url'], False) + del topic_pages_lookup[topic_story['url']] + + assert len(topic_pages_lookup) == 0 + + # Wait for pending URLs to disappear + WAIT_PENDING_SECONDS = 10 + pending_count = 0 + pending_retry = 0 + while pending_retry <= WAIT_PENDING_SECONDS: + pending_count = db.query("select count(*) from topic_fetch_urls where state ='pending'").flat()[0] + if pending_count > 0: + log.warning("Still pending_count URLs are pending, will retry shortly") + time.sleep(1) + else: + log.info("No more pending URLs, continuing") + break + + pending_retry += 1 + + assert pending_count == 0, f"After waiting {WAIT_PENDING_SECONDS} some URLs are still in 'pending' state" + + dead_link_count = db.query( "select count(*) from topic_fetch_urls where state ='request failed'").flat()[0] + dead_pages_count = db.query("select count(*) from topic_fetch_urls where url like '%dead%'").flat()[0] + + if dead_link_count != dead_pages_count: + fetch_states = db.query("select count(*), state from topic_fetch_urls group by state" ).hashes() + log.info(f"fetch states: {fetch_states}") + + fetch_errors = db.query("select * from topic_fetch_urls where state = 'python error'").hashes() + log.info(f"fetch errors: {fetch_errors}") + + assert dead_link_count == dead_pages_count, "dead link count" + +def validate_topic_links(db, topic, sites): + cid = topic['topics_id'] + + topic_links = db.query("select * from topic_links").hashes() + + log.info(f"TOPIC LINKS: {len(topic_links)}") + + all_pages = [] + [all_pages.extend(s['pages']) for s in sites] + + for page in all_pages: + if not page['matches_topic']: + continue + + for link in page['links']: + if not link['matches_topic']: + continue + + topic_links = db.query( + """ + select * + from 
topic_links cl + join stories s on (cl.stories_id = s.stories_id) + where + s.url = %(a)s and + cl.url = %(b)s and + cl.topics_id = %(c)s + """, + {'a': page['url'], 'b': link['url'], 'c': cid}).hashes() + + assert len(topic_links) == 1, f"number of topic_links for {page['url']} -> {link['url']}" + + topic_spider_metric = db.query( + "select sum(links_processed) links_processed from topic_spider_metrics where topics_id = %(a)s", + {'a': cid}).hash() + + assert topic_spider_metric,"topic spider metrics exist" + assert topic_spider_metric['links_processed'] > len(topic_links), "metrics links_processed greater than topic_links" + +def validate_for_errors(db): + """ test that no errors exist in the topics or snapshots tables""" + error_topics = db.query("select * from topics where state = 'error'").hashes() + + assert len( error_topics) == 0, f"topic errors: {error_topics}" + + error_snapshots = db.query("select * from snapshots where state = 'error'").hashes() + + assert len( error_snapshots) == 0, f"snapshot errors:{error_snapshots}" + +def validate_spider_results(db, topic, sites): + validate_topic_stories(db, topic, sites) + validate_topic_links(db, topic, sites) + validate_for_errors(db) + +def get_site_structure(sites): + meta_sites = [] + for site in sites: + meta_site = {'url': site['url'], 'pages': []} + for page in site['pages']: + meta_page = {'url': page['url'], 'matches_topic': page['matches_topic'], 'links': []} + [meta_page['links'].append(l['url']) for l in page['links']] + + if page['matches_topic'] and meta_page['matches_topic']: + meta_page['content'] = page['content'] + + meta_site['pages'].append(meta_page) + + meta_sites.append(meta_site) + + return meta_sites + +def test_mine(): + # we pseudo-randomly generate test data, but we want repeatable tests + random.seed(3) + + db = mediawords.db.connect_to_db() + + mediawords.util.mail.enable_test_mode() + + sites = get_test_sites() + + log.debug(f"SITE STRUCTURE {get_site_structure(sites)}") + + add_site_media(db, sites) + + hash_servers = start_hash_servers(sites) + + validate_pages(sites) + + topic = create_topic(db, sites) + + topics_mine.mine.DOMAIN_TIMEOUT = 0 + + topics_mine.mine.mine_topic( + db=db, + topic=topic, + skip_post_processing=True) + + validate_spider_results(db, topic, sites) + + [hs.stop for hs in hash_servers] diff --git a/apps/topics-mine/tests/python/test_mine_topic_stories.py b/apps/topics-mine/tests/python/test_mine_topic_stories.py new file mode 100644 index 0000000000..76a686ac5f --- /dev/null +++ b/apps/topics-mine/tests/python/test_mine_topic_stories.py @@ -0,0 +1,21 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_links(): + db = mediawords.db.connect_to_db() + + num_urls = 100 + + topic = create_test_topic(db, 'foo') + create_test_topic_stories(db, topic, 1, num_urls); + + topics_mine.mine.EXTRACT_STORY_LINKS_CHUNK_SIZE = int(num_urls / 2) - 1 + + topics_mine.mine.mine_topic_stories(db, topic) + + count_spidered_stories = db.query("select count(*) from topic_stories where link_mined").flat()[0] + assert count_spidered_stories == num_urls diff --git a/apps/topics-mine/tests/python/test_respider.py b/apps/topics-mine/tests/python/test_respider.py new file mode 100644 index 0000000000..f58725a71a --- /dev/null +++ b/apps/topics-mine/tests/python/test_respider.py @@ -0,0 +1,105 @@ +import mediawords.db +from 
mediawords.test.db.create import create_test_topic, create_test_topic_stories +import mediawords.util.sql +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_social_media_data(): + db = mediawords.db.connect_to_db() + + topic = create_test_topic(db, 'foo') + + topic['start_date'] = '2017-01-01' + topic['end_date'] = '2018-01-01' + + topic = db.update_by_id( + 'topics', + topic['topics_id'], + { 'max_stories': 0, 'start_date': '2017-01-01', 'end_date': '2018-01-01' } + ) + + num_stories = 101 + create_test_topic_stories(db, topic, 1, num_stories) + + # no respidering without respider_stories + db.query("update topic_stories set link_mined = 't'") + + topics_mine.mine.set_stories_respidering(db, topic, None) + + got_num_respider_stories = db.query( "select count(*) from topic_stories where not link_mined" ).flat()[0] + assert got_num_respider_stories == 0 + + # respider everything with respider_stories but no dates + topic['respider_stories'] = 1 + + db.query("update topic_stories set link_mined = 't'") + + topics_mine.mine.set_stories_respidering(db, topic, None) + + got_num_respider_stories = db.query( "select count(*) from topic_stories where not link_mined" ).flat()[0] + assert got_num_respider_stories == num_stories + + # respider stories within the range of changed dates + topic_update = { + 'respider_stories': 't', + 'respider_end_date': topic['end_date'], + 'respider_start_date': topic['start_date'], + 'end_date': '2019-01-01', + 'start_date': '2016-01-01' + } + + topic = db.update_by_id('topics', topic['topics_id'], topic_update) + + db.query("update topic_stories set link_mined = 't'") + + num_date_changes = 10 + db.query("update stories set publish_date = '2017-06-01'") + db.query( + """ + update stories set publish_date = %(a)s where stories_id in + (select stories_id from stories order by stories_id limit %(b)s) + """, + {'a': '2018-06-01', 'b': num_date_changes}) + db.query( + """ + update stories set publish_date = %(a)s where stories_id in + (select stories_id from stories order by stories_id desc limit %(b)s) + """, + {'a': '2016-06-01', 'b': num_date_changes}) + + snapshot = { + 'topics_id': topic['topics_id'], + 'snapshot_date': mediawords.util.sql.sql_now(), + 'start_date': topic['start_date'], + 'end_date': topic['end_date']} + + snapshot = db.create('snapshots', snapshot) + + timespan_dates = [['2017-01-01', '2017-01-31'], ['2017-12-20', '2018-01-20'], ['2016-12-20', '2017-01-20']] + + for dates in timespan_dates: + (start_date, end_date) = dates + timespan = { + 'snapshots_id': snapshot['snapshots_id'], + 'start_date': start_date, + 'end_date': end_date, + 'period': 'monthly', + 'story_count': 0, + 'story_link_count': 0, + 'medium_count': 0, + 'medium_link_count': 0, + 'post_count': 0} + + timespan = db.create('timespans', timespan) + + topics_mine.mine.set_stories_respidering(db, topic, snapshot['snapshots_id']) + + got_num_respider_stories = db.query("select count(*) from topic_stories where not link_mined").flat()[0] + assert got_num_respider_stories == 2 * num_date_changes + + got_num_archived_timespans = db.query( + "select count(*) from timespans where archive_snapshots_id = %(a)s", + {'a': snapshot['snapshots_id']}).flat()[0] + assert got_num_archived_timespans == 2 diff --git a/apps/topics-mine/tests/python/test_spider_new_links.py b/apps/topics-mine/tests/python/test_spider_new_links.py new file mode 100644 index 0000000000..88657de29b --- /dev/null +++ 
b/apps/topics-mine/tests/python/test_spider_new_links.py @@ -0,0 +1,32 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_links(): + db = mediawords.db.connect_to_db() + + num_urls = 10 + + topic = create_test_topic(db, 'foo') + create_test_topic_stories(db, topic, 1, num_urls); + + # add a bunch of urls with bad urls. the fetch-link job will fail with a python error + # but that's fine becase all we are testing here is that each url makes it into the job pool + db.query("delete from topic_links") + links = db.query( + """ + insert into topic_links (topics_id, stories_id, url) + select topics_id, stories_id, 'U ' || stories_id::text from topic_stories + returning * + """).hashes() + + topics_mine.mine.spider_new_links(db, topic, 1, None) + + count_processed_tfus = db.query("select count(*) from topic_fetch_urls where state = 'request failed'").flat()[0] + assert count_processed_tfus == num_urls + + count_spidered_links = db.query("select count(*) from topic_links where link_spidered").flat()[0] + assert count_spidered_links == num_urls diff --git a/dev/run_test.py b/dev/run_test.py index 99703d9f10..72b1c92525 100755 --- a/dev/run_test.py +++ b/dev/run_test.py @@ -68,7 +68,7 @@ def docker_test_commands(all_apps_dir: str, test_file: str, verbose: bool) -> Li if test_file.endswith('.py'): test_command = [ - 'py.test', '-s', '-vv', + 'py.test', '-s', '-vv', # Disable cache because it won't be preserved '-p', 'no:cacheprovider',