Tuesday, January 29, 2019

T-SQL Function to remove HTML tags

I needed to clean out some html tags from a column.  I could not find a function already out there so I crafted one from my google-fu findings.

So here it is:

IF OBJECT_ID (N'dbo.StripHTML', N'FN') IS NOT NULL 
    DROP FUNCTION StripHTML; 
GO 
CREATE FUNCTION dbo.StripHTML(@TextToClean nvarchar(max)) 
RETURNS NVARCHAR(max)
AS
BEGIN
--This function cleans out html tags and some special html characters
--20190129
--author: lyle.spencer@gmail.com
--original code: https://www.sqlservercentral.com/Forums/932589/Extracting-text-from-html-stored-in-SQL-table

-- To get the list of Tags removed....
Declare @TextToCleanTagNameTable Table(Tag nvarchar(max))
Declare @TextToCleanCharI1 int  -- Used to locate '<'
Declare @TextToCleanCharI2 int  -- Used to locate '>' to get Tag Name
Declare @TextToCleanCharICheck int  -- Used to Check for Tag Name
Declare @TextToCleanEndTagName nvarchar(1000)

Declare @TextToCleanStartPos int
Set @TextToCleanStartPos = 0

While 1=1
begin
   Set @TextToCleanCharI1 = CharIndex('<',@TextToClean,@TextToCleanStartPos) 
   if @TextToCleanCharI1 > 0
   begin
  -- Get the Tag Name, if it is a HTML Tag
  Set @TextToCleanCharI2 = CharIndex('>',@TextToClean,@TextToCleanCharI1 + 1)
  if @TextToCleanCharI2 > 0
  begin
-- Check <>
if @TextToCleanCharI2 = @TextToCleanCharI1 + 1
begin
Set @TextToCleanStartPos = @TextToCleanCharI2 + 1
end
else
begin
Set @TextToCleanCharICheck = CharIndex('<',@TextToClean,@TextToCleanCharI1 + 1)
if (@TextToCleanCharICheck < @TextToCleanCharI2)
begin           
   -- Last < found was not start of tag. Just < character / sign
   if @TextToCleanCharICheck > 0
  Set @TextToCleanStartPos = @TextToCleanCharICheck
   else
   begin
  -- @TextToCleanCharICheck = 0
  -- Its a Last TAG...
  Set @TextToCleanEndTagName = SubString(@TextToClean,@TextToCleanCharI1,(@TextToCleanCharI2-@TextToCleanCharI1) + 1)
  if Not Exists(Select 1 from @TextToCleanTagNameTable where Tag = @TextToCleanEndTagName)
Set @TextToClean = Replace(@TextToClean,@TextToCleanEndTagName,'')
Set @TextToCleanStartPos = @TextToCleanCharI2
   end

   if @TextToCleanStartPos = Len(@TextToClean)
   begin           
  --Select @TextToCleanStartPos , Len(@TextToClean),@TextToCleanCharICheck,@TextToCleanCharI1
  --print 'Over Here....'
  BREAK
   end
end
else
begin
   -- ITS A TAG....
   Set @TextToCleanEndTagName = SubString(@TextToClean,@TextToCleanCharI1,(@TextToCleanCharI2-@TextToCleanCharI1) + 1)             
   if Not Exists(Select 1 from @TextToCleanTagNameTable where Tag = @TextToCleanEndTagName)
   begin
  Set @TextToClean = Replace(@TextToClean,@TextToCleanEndTagName,'')
   end

   --Set @TextToCleanStartPos = @TextToCleanCharI2 + 1           
   Set @TextToCleanStartPos = @TextToCleanCharI1
   if @TextToCleanStartPos > Len(@TextToClean)
   begin
  --print 'Here....'
  BREAK
   end
end
end
  end
  else
BREAK  -- No Tag
   end 
   else
  BREAK
end
RETURN @TextToClean
END;
GO

--test me
Select dbo.stripHTML('<test1> test2')

No comments:

Post a Comment