python爬虫url管理器[url_version_manage.py]

# -*- coding: utf-8 -*-

__author__ = 'vanxkr.com'

import sqlite3

db_name = 'test.db'

sql_drop_url = '''DROP TABLE IF EXISTS new_url;
                  DROP TABLE IF EXISTS old_url;'''

sql_create_new_url = '''CREATE TABLE IF NOT EXISTS new_urls (
  id  INTEGER        PRIMARY KEY ASC AUTOINCREMENT
                     UNIQUE
                     NOT NULL,
  url VARCHAR (1000) UNIQUE
                     NOT NULL ON CONFLICT IGNORE
);'''

sql_create_old_url = '''CREATE TABLE IF NOT EXISTS old_urls (
  id  INTEGER        PRIMARY KEY ASC AUTOINCREMENT
                     UNIQUE
                     NOT NULL,
  url VARCHAR (1000) UNIQUE
                     NOT NULL ON CONFLICT IGNORE
);'''


class vanxkr_uvm(object):
  def __init__(self):
    self.__cache_get = []
    self.__current_get_num = 0 # 记录取出数量
    self.__cache_get_num = 100 # 缓冲池 每__cache_new_num条url存一次数据库
    self.__cache_put = []
    self.__cache_put_num = 1000 # 缓冲池 每__cache_old_num条url存一次数据库
    self._create_table()
    self._refresh_get_cache()

  def __del__(self):
    self._save_to_urls('new_urls', self.__cache_get[self.__current_get_num:])
    self._save_to_urls('old_urls', self.__cache_get[:self.__current_get_num])
    self.__cache_get = []

  def _create_table(self):
    db = sqlite3.connect(db_name)
    cursor = db.cursor()
    cursor.execute(sql_create_new_url)
    cursor.execute(sql_create_old_url)
    db.commit()
    db.close()

  def _save_to_urls(self, table_name, urls):
    db = sqlite3.connect(db_name)
    cursor = db.cursor()
    save_list = []
    for i in urls: # 筛选 urls
      sql = 'SELECT count(*) FROM %s WHERE url = "%s";' % (table_name, i)
      cursor.execute(sql) # 查询是否在 table_name 的里
      if(cursor.fetchall()[0][0]): # 存在
        continue
      else: # 不存在 就要存储
        save_list.append(i)
    if save_list:
        sql = 'INSERT INTO %s (url) VALUES ' % table_name
        for i in save_list:
          sql += '("%s"),' % i
        sql = sql[:-1] + ';'
        try:
          cursor.execute(sql) # 存入 table_name
        except Exception as e: # 出错
          print(e)
          print(sql)
        else: # 存入 table_name 成功
          db.commit()
        db.close()

  def _refresh_get_cache(self):
    self._save_to_urls('old_urls', self.__cache_get)
    self.__cache_get = []
    self._refresh_put_cache()
    sql = 'SELECT url FROM new_urls LIMIT %s;' % self.__cache_get_num
    db = sqlite3.connect(db_name)
    cursor = db.cursor()
    cursor.execute(sql)
    self.__cache_get = list(i[0] for i in cursor.fetchall())
    # 回删 已取出的数据
    sql = 'delete from new_urls where id in (select id from new_urls order by id limit %s);' % self.__cache_get_num
    cursor.execute(sql)
    db.commit()
    db.close()
    self.__current_get_num = 0 # 重置

  def _refresh_put_cache(self):
    tmp = list(set(self.__cache_get)^set(self.__cache_put)) # 取出去的 和 拿进来的 的 差集
    self.__cache_put = []
    db = sqlite3.connect(db_name)
    cursor = db.cursor()
    save_list = []
    for i in tmp: # 筛选 new_urls
      sql_old = 'SELECT count(*) FROM old_urls WHERE url = "%s";' % i
      cursor.execute(sql_old) # 查询是否在 old_urls 里
      if cursor.fetchall()[0][0]: # 存在
        continue
      else: # 不存在 就要存储
        save_list.append(i)
    db.close()
    self._save_to_urls('new_urls', save_list)

  def reset_db(self):
    db = sqlite3.connect(db_name)
    cursor = db.cursor()
    cursor.execute(sql_drop_url)
    db.close()
    self._create_table()

  def get_url(self):
    if self.__cache_get:
      ret_url = self.__cache_get[self.__current_get_num] # 取出一个新的url
      self.__current_get_num += 1
      if(self.__current_get_num == self.__cache_get_num\
        or self.__current_get_num == len(self.__cache_get)): # 是否用完了
        self._refresh_get_cache() # 更新缓冲池
      return ret_url;
    else:
      self._refresh_get_cache()
      return 'null'

  def put_urls(self, urls):
    self.__cache_put.extend(urls)
    if (0 == len(self.__cache_get) # 获取缓存区是空的
    or self.__current_get_num == len(self.__cache_get)): # 获取缓存区已用完
      self._refresh_get_cache()
    if len(self.__cache_put) >= self.__cache_put_num: # 缓冲池满了
      self._refresh_put_cache() # 清空缓冲池

  def get_history_num(self):
    sql = 'SELECT count(*) FROM old_urls;'
    db = sqlite3.connect(db_name)
    cursor = db.cursor()
    cursor.execute(sql)
    return cursor.fetchall()[0][0]

本文作者:vanxkr

本文链接:http://www.vanxkr.com/2018/8/python-url-version-manage

版权声明:本博客所有文章除特别声明外,均采用CC BY-NC-SA 3.0许可协议。转载请注明出处!

python实现约瑟夫环完整过程
0 条评论
已登录,注销 取消