rabin 2 月之前
父节点
当前提交
b218afa645
共有 2 个文件被更改,包括 3 次插入和 1 次删除
  1. + 1 - 0
      requirements.txt
  2. + 2 - 1
      service/extract/docs/pdf.py

+ 1 - 0
requirements.txt

@@ -1,4 +1,5 @@
 demeter-lib
+tornado==6.2
 redis
 requests
 gevent

+ 2 - 1
service/extract/docs/pdf.py

@@ -43,7 +43,8 @@ class Pdf(Base):
             for i, b in enumerate(blocks):
                 y_top = b["bbox"][1]
                 y_bottom = b["bbox"][3]
-                if y_top < page_height * 0.05 or y_bottom > page_height * 0.95:
+                block_height = y_bottom - y_top
+                if (y_top < page_height * 0.02 or y_bottom > page_height * 0.98) and block_height < 20:
                     continue
 
                 if b['type'] == 0: