有什么办法可以直接获取一个网页的正文?想想就觉得挺难:页面里有这么多 HTML 元素,还真不知道应该提取哪一部分。
这里有一个原理的讲解:
XXXXXXXXXXXXXXXXXXXXXX/p/cx-extractor/
下面是针对它的 Python 实现。
# -*- coding=utf-8 -*-
import codecs
import contextlib
import cProfile
import mimetypes
import os
import re
import sys
import urllib2

# Python 2 only: re-expose sys.setdefaultencoding (hidden after start-up) and
# make UTF-8 the default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')

# Patterns used to reduce an HTML document to plain text.  All of them are
# case-insensitive and let "." span newlines (re.S).
re_title = re.compile(r'<title>(.*?)</title>', re.I | re.U | re.S)
re_body = re.compile(r'<body[^>]*>.*</body>', re.I | re.U | re.S)
re_doc_type = re.compile(r'<!DOCTYPE.*?>', re.I | re.U | re.S)
re_comment = re.compile(r'<!--.*?-->', re.I | re.U | re.S)
# "\b[^>]*" accepts both "<script>" and "<script type=...>".  Requiring a
# character after "<script" would miss bare tags and leak inline JavaScript
# into the extracted text.
re_js = re.compile(r'<script\b[^>]*>.*?</script\s*>', re.I | re.U | re.S)
re_css = re.compile(r'<style[^>]*>.*?</style>', re.I | re.U | re.S)
# Only real character references (&name; / &#123; / &#x1F;) are removed.  A
# looser "any 2-8 characters between & and ;" rule also eats ordinary text
# such as "&T is big;".
re_special = re.compile(
    r'&(?:#[0-9]{1,7}|#x[0-9a-f]{1,6}|[a-z][a-z0-9]{1,7});',
    re.I | re.U | re.S)
re_other = re.compile(r'<[^>]*>', re.I | re.U | re.S)
_re_whitespace = re.compile(r'\s+')

# Number of consecutive lines summed into one block.
BLOCK_HEIGHT = 3
# A block must hold more than this many characters to start a text run.
THRESHOLD = 90


class TextExtract(object):
    """Extract the title and the main text of an HTML page.

    The ``<body>`` is stripped of markup and split into lines.  Every line
    index ``i`` gets a *block length*: the number of characters in lines
    ``i .. i + BLOCK_HEIGHT - 1``.  A text run starts at the first block
    longer than ``THRESHOLD`` whose successor is non-empty, and ends at the
    first pair of consecutive empty blocks.

    Attributes:
        html: the document passed to the constructor.
        join: if true, every text run is collected; otherwise only the first.
        title: contents of the first ``<title>`` element ('' if none).
        text_body: the ``<body>`` element, tags removed after extraction.
        block_len: block lengths computed by ``extract_text``.
        text_start, text_end: bounds of the run most recently examined.
        content: the extracted text, or ``'nothing can find'``.
    """

    def __init__(self, new_html, join=True):
        """Store *new_html* and run the extraction immediately."""
        self.html = new_html
        self.join = join
        self.text_start = 0
        self.text_end = 0
        self.text_body = ''
        self.block_len = []
        self.title = ''
        self.content = ''
        self.extract()

    def extract(self):
        """Run all extraction steps; results land in ``title``/``content``."""
        self.extract_title()
        self.extract_body()
        self.remove_tags()
        self.extract_text()

    def extract_title(self):
        """Set ``title`` from the first ``<title>`` element, if present."""
        m = re_title.search(self.html)
        if m:
            self.title = m.group(1)

    def extract_body(self):
        """Set ``text_body`` to the ``<body>...</body>`` span, if present."""
        m = re_body.search(self.html)
        if m:
            self.text_body = m.group()

    def remove_tags(self):
        """Delete markup from ``text_body``, leaving text and newlines."""
        # Order matters: scripts and styles are removed with their content
        # before the generic tag pattern strips the remaining tags.
        for pattern in (re_doc_type, re_comment, re_js, re_css,
                        re_special, re_other):
            self.text_body = pattern.sub('', self.text_body)

    def extract_text(self):
        """Locate the text run(s) in ``text_body`` and fill ``content``."""
        lines = [_re_whitespace.sub(' ', raw).strip()
                 for raw in self.text_body.split('\n')]
        line_count = len(lines)

        # Blank out short lines (< 30 chars) that stand alone between two
        # empty lines.
        for i in range(1, line_count - 1):
            if 0 < len(lines[i]) < 30 and not lines[i - 1] and not lines[i + 1]:
                lines[i] = ''

        # Rebuilt from scratch so that calling extract() again does not
        # append to stale data.
        self.block_len = [
            sum(len(line) for line in lines[i:i + BLOCK_HEIGHT])
            for i in range(line_count - BLOCK_HEIGHT)
        ]

        self.content = ''
        self.text_start = self.find_text_start(0)
        self.text_end = 0

        # find_text_start() returns 0 for "not found", but 0 is also a valid
        # start index, so check block 0 explicitly before giving up.
        if self.text_start == 0 and not self._is_text_start(0):
            self.content = 'nothing can find'
            return

        pieces = []
        while True:
            self.text_end = self._find_end_line(lines)
            pieces.append(self.get_text(lines))
            if not self.join:
                break
            self.text_start = self.find_text_start(self.text_end)
            # text_end is always >= 1 here, so 0 can only mean "no more runs".
            if self.text_start == 0:
                break
        self.content = ''.join(pieces)

    def _is_text_start(self, i):
        """Return True if block *i* satisfies the start-of-text condition."""
        blocks = self.block_len
        return (0 <= i < len(blocks) - 1
                and blocks[i] > THRESHOLD
                and blocks[i + 1] > 0)

    def _find_end_line(self, lines):
        """Return the exclusive end line of the run starting at text_start."""
        end = self.find_text_end(self.text_start)
        # find_text_end() yields len(block_len) - 1 only as its fallback.
        if end < len(self.block_len) - 1:
            return end
        # No boundary was found: the run reaches the end of the body.  Keep
        # the trailing lines (the block array stops BLOCK_HEIGHT lines short
        # of the end) but drop trailing empty ones.
        end = len(lines)
        while end > self.text_start and not lines[end - 1]:
            end -= 1
        return end

    def find_text_start(self, index):
        """Return the first block >= *index* that starts a text run.

        Returns 0 when no such block exists; note that 0 is therefore
        ambiguous when *index* is 0.
        """
        blocks = self.block_len
        for i in range(index, len(blocks) - 1):
            if blocks[i] > THRESHOLD and blocks[i + 1] > 0:
                return i
        return 0

    def find_text_end(self, index):
        """Return the first block >= *index* followed by two empty blocks.

        More precisely: the first ``i`` with ``block_len[i]`` and
        ``block_len[i + 1]`` both zero.  Falls back to the last block index
        when there is none.
        """
        blocks = self.block_len
        for i in range(index, len(blocks) - 1):
            if blocks[i] == 0 and blocks[i + 1] == 0:
                return i
        return len(blocks) - 1

    def get_text(self, lines):
        """Return lines[text_start:text_end], each terminated by a newline."""
        return ''.join(
            line + '\n' for line in lines[self.text_start:self.text_end])

# Other ways to drive the extractor, kept for reference:
#
# with codecs.open('/home/yz/download/XXXXXXml', 'r', 'utf-8') as file:
#     html = file.read()
#     text_extract = TextExtract(html)
#     print text_extract.content
#
# text_extract = TextExtract('<html><title>asdfasf</title><body>\nasdfasfd</body></html>')
# print text_extract.content

# Demo: download one page and print the text extracted from it.
try:
    # Placeholder value; replace with the address of the page to fetch.
    url = 'XXXXXXXXXXXXXXXXXXXXX'
    # closing() releases the connection even if reading fails.
    with contextlib.closing(urllib2.urlopen(url)) as proxied_request:
        status_code = proxied_request.code
        # guess_type() returns (type, encoding); only the type is wanted.
        mimetype = (proxied_request.headers.typeheader
                    or mimetypes.guess_type(url)[0])
        content = proxied_request.read()
    # Decoding is left disabled; the raw response body is passed on as is.
    # encoding = proxied_request.headers['content-type'].split('charset=')[-1]
    # ucontent = unicode(content, encoding)
    text_extract = TextExtract(content)
    print(text_extract.content)
except urllib2.URLError as e:
    # URLError is the base class of HTTPError, so connection failures are
    # reported the same way as HTTP error statuses.
    print(e)
转自 开源中国 余争 发布于 2013年08月12日 12时